/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Victor Stinnerce5faf62011-10-05 00:42:43 +020049#ifdef Py_DEBUG
50# define DONT_MAKE_RESULT_READY
51#endif
52
/* Upper bound on the number of Unicode objects kept on the free list. */

#define PyUnicode_MAXFREELIST       1024

/* Size threshold for the free list "stay alive" optimization.

   Objects on the free list whose size is below this limit keep their
   allocated Unicode memory intact, which reduces malloc() overhead
   for small Unicode objects.

   In the worst case this leaves PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage around.

   A limit of 0 effectively turns the feature off.

   Note: this is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; little endian is assumed unless told otherwise. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
83
/* --- Globals ------------------------------------------------------------

   The globals are initialized by the _PyUnicode_Init() API and should
   not be used before calling that API.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
/* Object check used inside the accessor asserts below: the full layout
   check in debug builds, a plain type check otherwise. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Raw access to the cached UTF-8 pointer (no checks). */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* Checked UTF-8 pointer: for compact ASCII objects the bytes directly
   following the PyASCIIObject header are used, otherwise the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* Raw access to the cached UTF-8 length (no checks). */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* Checked UTF-8 length: compact ASCII objects report their character
   length, everything else the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* Unchecked field accessors; all of them are usable as lvalues. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)

/* Checked read access to the kind and the length. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* Data pointer of a non-compact (PyUnicodeObject) string, unchecked. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200136
/* Local override of PyUnicode_READY(): evaluates to 0 when the object is
   already ready, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready((PyObject *)(op))))

/* Same as PyUnicode_READY(), but takes a pointer to the object pointer and
   defers to _PyUnicode_ReadyReplace() when the object is not ready.
   The argument is parenthesized so that any pointer expression is safe. */
#define _PyUnicode_READY_REPLACE(p_obj)                 \
    (assert(_PyUnicode_CHECK(*(p_obj))),                \
     (PyUnicode_IS_READY(*(p_obj)) ?                    \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))

/* true if the UTF-8 pointer aliases the canonical data buffer */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* true if the wstr pointer aliases the canonical data buffer.
   Both sides must refer to the macro argument; the previous expansion
   read the wstr pointer from a caller variable named "unicode". */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
156
/* True when the object owns a separately allocated UTF-8 buffer, i.e. the
   buffer exists and is not shared with the canonical data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True when the object owns a separately allocated wstr buffer, i.e. the
   buffer exists and is not shared with the canonical data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))

/* Generic element-wise copy between character arrays of different widths.
   from_type and to_type must be valid type names; begin and end delimit
   the source range and should be of type "from_type *"; to points to the
   destination buffer, which receives one to_type value per source
   element. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        const from_type *iter_ = (begin);               \
        to_type *to_ = (to_type *)(to);                 \
        while (iter_ < (end)) {                         \
            *to_++ = (to_type)*iter_++;                 \
        }                                               \
    } while (0)

/* The string contents changed: invalidate the cached hash. */
#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
190
Walter Dörwald16807132007-05-25 13:52:07 +0000191/* This dictionary holds all interned unicode strings. Note that references
192 to strings in this dictionary are *not* counted in the string's ob_refcnt.
193 When the interned string reaches a refcnt of 0 the string deallocation
194 function will delete the reference from this dictionary.
195
196 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000197 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000198*/
199static PyObject *interned;
200
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000201/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200202static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000203
204/* Single character Unicode strings in the Latin-1 range are being
205 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200206static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000207
/* Fast detection of the most frequent whitespace characters: one flag per
   ASCII code point, non-zero for whitespace.  Flagged entries:
     0x0009 CHARACTER TABULATION   0x000A LINE FEED
     0x000B LINE TABULATION        0x000C FORM FEED
     0x000D CARRIAGE RETURN        0x001C FILE SEPARATOR
     0x001D GROUP SEPARATOR        0x001E RECORD SEPARATOR
     0x001F UNIT SEPARATOR         0x0020 SPACE */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
238
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200239/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200240static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200241static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200242
Alexander Belopolsky40018472011-02-26 01:02:56 +0000243static PyObject *
244unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000245 PyObject **errorHandler,const char *encoding, const char *reason,
246 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
247 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
248
Alexander Belopolsky40018472011-02-26 01:02:56 +0000249static void
250raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300251 const char *encoding,
252 const Py_UNICODE *unicode, Py_ssize_t size,
253 Py_ssize_t startpos, Py_ssize_t endpos,
254 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000255
/* Fast detection of line breaks: one flag per ASCII code point, non-zero
   for line break characters.  The table is only ever read (see
   BLOOM_LINEBREAK), hence const.  Flagged entries:
     0x000A LINE FEED              0x000B LINE TABULATION
     0x000C FORM FEED              0x000D CARRIAGE RETURN
     0x001C FILE SEPARATOR         0x001D GROUP SEPARATOR
     0x001E RECORD SEPARATOR */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
283
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300284/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
285 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000286Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000287PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000288{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000289#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000290 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000291#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000292 /* This is actually an illegal character, so it should
293 not be passed to unichr. */
294 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000295#endif
296}
297
#ifdef Py_DEBUG
/* Debug-only validation of the internal layout of a Unicode object.

   op            -- the object to inspect (must satisfy PyUnicode_Check)
   check_content -- when non-zero, additionally scan the characters and
                    verify that the narrowest possible kind is in use

   Every invariant is enforced through assert(); the function itself
   always returns 1 so that callers can wrap the call in assert(). */
static int
/* FIXME: use PyObject* type for op */
_PyUnicode_CheckConsistency(void *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII: only the PyASCIIObject header is present */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* the kind whose storage unit has the same width as wchar_t */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wstr_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wstr_kind = PyUnicode_4BYTE_KIND;
#endif

        if (ascii->state.compact == 1) {
            /* compact non-ASCII: characters follow the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                /* legacy string that has been made ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else {
                    assert (compact->utf8 != data);
                }
            }
            else {
                /* legacy string that is not ready: wstr only */
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must alias the data exactly when the widths match */
            if (kind == wstr_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        /* absent caches must report a length of zero */
        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t pos = 0;
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);

        while (pos < ascii->length) {
            Py_UCS4 cur = PyUnicode_READ(kind, data, pos);
            if (maxchar < cur) {
                maxchar = cur;
            }
            pos++;
        }

        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
            }
            else {
                assert(maxchar < 128);
            }
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
        }
        else {
            assert(maxchar >= 0x10000);
        }
    }
    return 1;
}
#endif
401
Thomas Wouters477c8d52006-05-27 19:21:47 +0000402/* --- Bloom Filters ----------------------------------------------------- */
403
404/* stuff to implement simple "bloom filters" for Unicode characters.
405 to keep things simple, we use a single bitmask, using the least 5
406 bits from each unicode characters as the bit index. */
407
408/* the linebreak mask is set up by Unicode_Init below */
409
Antoine Pitrouf068f942010-01-13 14:19:12 +0000410#if LONG_BIT >= 128
411#define BLOOM_WIDTH 128
412#elif LONG_BIT >= 64
413#define BLOOM_WIDTH 64
414#elif LONG_BIT >= 32
415#define BLOOM_WIDTH 32
416#else
417#error "LONG_BIT is smaller than 32"
418#endif
419
Thomas Wouters477c8d52006-05-27 19:21:47 +0000420#define BLOOM_MASK unsigned long
421
422static BLOOM_MASK bloom_linebreak;
423
Antoine Pitrouf068f942010-01-13 14:19:12 +0000424#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
425#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000426
Benjamin Peterson29060642009-01-31 22:14:21 +0000427#define BLOOM_LINEBREAK(ch) \
428 ((ch) < 128U ? ascii_linebreak[(ch)] : \
429 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000430
Alexander Belopolsky40018472011-02-26 01:02:56 +0000431Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200432make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000433{
434 /* calculate simple bloom-style bitmask for a given unicode string */
435
Antoine Pitrouf068f942010-01-13 14:19:12 +0000436 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000437 Py_ssize_t i;
438
439 mask = 0;
440 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200441 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000442
443 return mask;
444}
445
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200446#define BLOOM_MEMBER(mask, chr, str) \
447 (BLOOM(mask, chr) \
448 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000449
Guido van Rossumd57fd912000-03-10 22:53:23 +0000450/* --- Unicode Object ----------------------------------------------------- */
451
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200452static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200453fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200454
455Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
456 Py_ssize_t size, Py_UCS4 ch,
457 int direction)
458{
459 /* like wcschr, but doesn't stop at NULL characters */
460 Py_ssize_t i;
461 if (direction == 1) {
462 for(i = 0; i < size; i++)
463 if (PyUnicode_READ(kind, s, i) == ch)
464 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
465 }
466 else {
467 for(i = size-1; i >= 0; i--)
468 if (PyUnicode_READ(kind, s, i) == ch)
469 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
470 }
471 return NULL;
472}
473
Victor Stinnerfe226c02011-10-03 03:52:20 +0200474static PyObject*
475resize_compact(PyObject *unicode, Py_ssize_t length)
476{
477 Py_ssize_t char_size;
478 Py_ssize_t struct_size;
479 Py_ssize_t new_size;
480 int share_wstr;
481
482 assert(PyUnicode_IS_READY(unicode));
483 char_size = PyUnicode_CHARACTER_SIZE(unicode);
484 if (PyUnicode_IS_COMPACT_ASCII(unicode))
485 struct_size = sizeof(PyASCIIObject);
486 else
487 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200488 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200489
490 _Py_DEC_REFTOTAL;
491 _Py_ForgetReference(unicode);
492
493 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
494 PyErr_NoMemory();
495 return NULL;
496 }
497 new_size = (struct_size + (length + 1) * char_size);
498
499 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
500 if (unicode == NULL) {
501 PyObject_Del(unicode);
502 PyErr_NoMemory();
503 return NULL;
504 }
505 _Py_NewReference(unicode);
506 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200507 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200508 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200509 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
510 _PyUnicode_WSTR_LENGTH(unicode) = length;
511 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200512 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
513 length, 0);
514 return unicode;
515}
516
Alexander Belopolsky40018472011-02-26 01:02:56 +0000517static int
Victor Stinner95663112011-10-04 01:03:50 +0200518resize_inplace(PyUnicodeObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000519{
Victor Stinner95663112011-10-04 01:03:50 +0200520 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200521 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200522 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000523
Victor Stinner95663112011-10-04 01:03:50 +0200524 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200525
526 if (PyUnicode_IS_READY(unicode)) {
527 Py_ssize_t char_size;
528 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200529 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200530 void *data;
531
532 data = _PyUnicode_DATA_ANY(unicode);
533 assert(data != NULL);
534 char_size = PyUnicode_CHARACTER_SIZE(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200535 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
536 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200537 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
538 {
539 PyObject_DEL(_PyUnicode_UTF8(unicode));
540 _PyUnicode_UTF8(unicode) = NULL;
541 _PyUnicode_UTF8_LENGTH(unicode) = 0;
542 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200543
544 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
545 PyErr_NoMemory();
546 return -1;
547 }
548 new_size = (length + 1) * char_size;
549
550 data = (PyObject *)PyObject_REALLOC(data, new_size);
551 if (data == NULL) {
552 PyErr_NoMemory();
553 return -1;
554 }
555 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200556 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200557 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200558 _PyUnicode_WSTR_LENGTH(unicode) = length;
559 }
560 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200561 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200562 _PyUnicode_UTF8_LENGTH(unicode) = length;
563 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200564 _PyUnicode_LENGTH(unicode) = length;
565 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200566 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200567 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200568 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200569 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200570 }
Victor Stinner95663112011-10-04 01:03:50 +0200571 assert(_PyUnicode_WSTR(unicode) != NULL);
572
573 /* check for integer overflow */
574 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
575 PyErr_NoMemory();
576 return -1;
577 }
578 wstr = _PyUnicode_WSTR(unicode);
579 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
580 if (!wstr) {
581 PyErr_NoMemory();
582 return -1;
583 }
584 _PyUnicode_WSTR(unicode) = wstr;
585 _PyUnicode_WSTR(unicode)[length] = 0;
586 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200587 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000588 return 0;
589}
590
Victor Stinnerfe226c02011-10-03 03:52:20 +0200591static PyObject*
592resize_copy(PyObject *unicode, Py_ssize_t length)
593{
594 Py_ssize_t copy_length;
595 if (PyUnicode_IS_COMPACT(unicode)) {
596 PyObject *copy;
597 assert(PyUnicode_IS_READY(unicode));
598
599 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
600 if (copy == NULL)
601 return NULL;
602
603 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
604 if (PyUnicode_CopyCharacters(copy, 0,
605 unicode, 0,
606 copy_length) < 0)
607 {
608 Py_DECREF(copy);
609 return NULL;
610 }
611 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200612 }
613 else {
Victor Stinner2fd82272011-10-03 04:06:05 +0200614 PyUnicodeObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200615 assert(_PyUnicode_WSTR(unicode) != NULL);
616 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner2fd82272011-10-03 04:06:05 +0200617 w = _PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200618 if (w == NULL)
619 return NULL;
620 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
621 copy_length = Py_MIN(copy_length, length);
622 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
623 copy_length);
624 return (PyObject*)w;
625 }
626}
627
Guido van Rossumd57fd912000-03-10 22:53:23 +0000628/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000629 Ux0000 terminated; some code (e.g. new_identifier)
630 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000631
632 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000633 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000634
635*/
636
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200637#ifdef Py_DEBUG
638int unicode_old_new_calls = 0;
639#endif
640
Alexander Belopolsky40018472011-02-26 01:02:56 +0000641static PyUnicodeObject *
642_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643{
644 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200645 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000646
Thomas Wouters477c8d52006-05-27 19:21:47 +0000647 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000648 if (length == 0 && unicode_empty != NULL) {
649 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200650 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000651 }
652
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000653 /* Ensure we won't overflow the size. */
654 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
655 return (PyUnicodeObject *)PyErr_NoMemory();
656 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200657 if (length < 0) {
658 PyErr_SetString(PyExc_SystemError,
659 "Negative size passed to _PyUnicode_New");
660 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000661 }
662
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200663#ifdef Py_DEBUG
664 ++unicode_old_new_calls;
665#endif
666
667 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
668 if (unicode == NULL)
669 return NULL;
670 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
671 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
672 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000673 PyErr_NoMemory();
674 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000675 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200676
Jeremy Hyltond8082792003-09-16 19:41:39 +0000677 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000678 * the caller fails before initializing str -- unicode_resize()
679 * reads str[0], and the Keep-Alive optimization can keep memory
680 * allocated for str alive across a call to unicode_dealloc(unicode).
681 * We don't want unicode_resize to read uninitialized memory in
682 * that case.
683 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200684 _PyUnicode_WSTR(unicode)[0] = 0;
685 _PyUnicode_WSTR(unicode)[length] = 0;
686 _PyUnicode_WSTR_LENGTH(unicode) = length;
687 _PyUnicode_HASH(unicode) = -1;
688 _PyUnicode_STATE(unicode).interned = 0;
689 _PyUnicode_STATE(unicode).kind = 0;
690 _PyUnicode_STATE(unicode).compact = 0;
691 _PyUnicode_STATE(unicode).ready = 0;
692 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200693 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200694 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200695 _PyUnicode_UTF8(unicode) = NULL;
696 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000697 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000698
Benjamin Peterson29060642009-01-31 22:14:21 +0000699 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000700 /* XXX UNREF/NEWREF interface should be more symmetrical */
701 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000702 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000703 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000704 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000705}
706
Victor Stinnerf42dc442011-10-02 23:33:16 +0200707static const char*
708unicode_kind_name(PyObject *unicode)
709{
Victor Stinner42dfd712011-10-03 14:41:45 +0200710 /* don't check consistency: unicode_kind_name() is called from
711 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200712 if (!PyUnicode_IS_COMPACT(unicode))
713 {
714 if (!PyUnicode_IS_READY(unicode))
715 return "wstr";
716 switch(PyUnicode_KIND(unicode))
717 {
718 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200719 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200720 return "legacy ascii";
721 else
722 return "legacy latin1";
723 case PyUnicode_2BYTE_KIND:
724 return "legacy UCS2";
725 case PyUnicode_4BYTE_KIND:
726 return "legacy UCS4";
727 default:
728 return "<legacy invalid kind>";
729 }
730 }
731 assert(PyUnicode_IS_READY(unicode));
732 switch(PyUnicode_KIND(unicode))
733 {
734 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200735 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200736 return "ascii";
737 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200738 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200739 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200740 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200741 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200742 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200743 default:
744 return "<invalid compact kind>";
745 }
746}
747
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200748#ifdef Py_DEBUG
749int unicode_new_new_calls = 0;
750
751/* Functions wrapping macros for use in debugger */
/* Debugger helper: function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
755
/* Debugger helper: function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
759void *_PyUnicode_data(void *unicode){
760 printf("obj %p\n", unicode);
761 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
762 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
763 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
764 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
765 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
766 return PyUnicode_DATA(unicode);
767}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200768
769void
770_PyUnicode_Dump(PyObject *op)
771{
772 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200773 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
774 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
775 void *data;
776 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
777 if (ascii->state.compact)
778 data = (compact + 1);
779 else
780 data = unicode->data.any;
781 if (ascii->wstr == data)
782 printf("shared ");
783 printf("wstr=%p", ascii->wstr);
Victor Stinnera3b334d2011-10-03 13:53:37 +0200784 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200785 printf(" (%zu), ", compact->wstr_length);
786 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
787 printf("shared ");
788 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200789 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200790 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200791}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200792#endif
793
794PyObject *
795PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
796{
797 PyObject *obj;
798 PyCompactUnicodeObject *unicode;
799 void *data;
800 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200801 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200802 Py_ssize_t char_size;
803 Py_ssize_t struct_size;
804
805 /* Optimization for empty strings */
806 if (size == 0 && unicode_empty != NULL) {
807 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200808 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200809 }
810
811#ifdef Py_DEBUG
812 ++unicode_new_new_calls;
813#endif
814
Victor Stinner9e9d6892011-10-04 01:02:02 +0200815 is_ascii = 0;
816 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200817 struct_size = sizeof(PyCompactUnicodeObject);
818 if (maxchar < 128) {
819 kind_state = PyUnicode_1BYTE_KIND;
820 char_size = 1;
821 is_ascii = 1;
822 struct_size = sizeof(PyASCIIObject);
823 }
824 else if (maxchar < 256) {
825 kind_state = PyUnicode_1BYTE_KIND;
826 char_size = 1;
827 }
828 else if (maxchar < 65536) {
829 kind_state = PyUnicode_2BYTE_KIND;
830 char_size = 2;
831 if (sizeof(wchar_t) == 2)
832 is_sharing = 1;
833 }
834 else {
835 kind_state = PyUnicode_4BYTE_KIND;
836 char_size = 4;
837 if (sizeof(wchar_t) == 4)
838 is_sharing = 1;
839 }
840
841 /* Ensure we won't overflow the size. */
842 if (size < 0) {
843 PyErr_SetString(PyExc_SystemError,
844 "Negative size passed to PyUnicode_New");
845 return NULL;
846 }
847 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
848 return PyErr_NoMemory();
849
850 /* Duplicated allocation code from _PyObject_New() instead of a call to
851 * PyObject_New() so we are able to allocate space for the object and
852 * it's data buffer.
853 */
854 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
855 if (obj == NULL)
856 return PyErr_NoMemory();
857 obj = PyObject_INIT(obj, &PyUnicode_Type);
858 if (obj == NULL)
859 return NULL;
860
861 unicode = (PyCompactUnicodeObject *)obj;
862 if (is_ascii)
863 data = ((PyASCIIObject*)obj) + 1;
864 else
865 data = unicode + 1;
866 _PyUnicode_LENGTH(unicode) = size;
867 _PyUnicode_HASH(unicode) = -1;
868 _PyUnicode_STATE(unicode).interned = 0;
869 _PyUnicode_STATE(unicode).kind = kind_state;
870 _PyUnicode_STATE(unicode).compact = 1;
871 _PyUnicode_STATE(unicode).ready = 1;
872 _PyUnicode_STATE(unicode).ascii = is_ascii;
873 if (is_ascii) {
874 ((char*)data)[size] = 0;
875 _PyUnicode_WSTR(unicode) = NULL;
876 }
877 else if (kind_state == PyUnicode_1BYTE_KIND) {
878 ((char*)data)[size] = 0;
879 _PyUnicode_WSTR(unicode) = NULL;
880 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200881 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200882 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200883 }
884 else {
885 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200886 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200887 if (kind_state == PyUnicode_2BYTE_KIND)
888 ((Py_UCS2*)data)[size] = 0;
889 else /* kind_state == PyUnicode_4BYTE_KIND */
890 ((Py_UCS4*)data)[size] = 0;
891 if (is_sharing) {
892 _PyUnicode_WSTR_LENGTH(unicode) = size;
893 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
894 }
895 else {
896 _PyUnicode_WSTR_LENGTH(unicode) = 0;
897 _PyUnicode_WSTR(unicode) = NULL;
898 }
899 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200900 assert(_PyUnicode_CheckConsistency(unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200901 return obj;
902}
903
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) into the UCS4 buffer of
   `unicode`, combining each valid surrogate pair into a single code point.
   A surrogate that is not part of a valid pair is copied through unchanged.
   The other width conversions are implemented as macros for efficiency.

   `unicode` must already be a 4-byte-kind string whose length equals the
   number of code points produced, with room for one additional code point
   for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));

        /* High surrogate immediately followed, within range, by a low
           surrogate. */
        is_pair = (src[0] >= 0xD800 && src[0] <= 0xDBFF
                   && (src + 1) < end
                   && src[1] >= 0xDC00 && src[1] <= 0xDFFF);
        if (is_pair) {
            *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
            src += 2;
        }
        else {
            *dst++ = *src++;
        }
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
942
Victor Stinnercd9950f2011-10-02 00:34:53 +0200943static int
944_PyUnicode_Dirty(PyObject *unicode)
945{
Victor Stinner910337b2011-10-03 03:20:16 +0200946 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +0200947 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +0200948 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +0200949 "Cannot modify a string having more than 1 reference");
950 return -1;
951 }
952 _PyUnicode_DIRTY(unicode);
953 return 0;
954}
955
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200956Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200957PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
958 PyObject *from, Py_ssize_t from_start,
959 Py_ssize_t how_many)
960{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200961 unsigned int from_kind, to_kind;
962 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200963
Victor Stinnerb1536152011-09-30 02:26:10 +0200964 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
965 PyErr_BadInternalCall();
966 return -1;
967 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200968
969 if (PyUnicode_READY(from))
970 return -1;
971 if (PyUnicode_READY(to))
972 return -1;
973
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200974 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200975 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
Victor Stinner01698042011-10-04 00:04:26 +0200976 PyErr_Format(PyExc_SystemError,
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200977 "Cannot write %zi characters at %zi "
978 "in a string of %zi characters",
979 how_many, to_start, PyUnicode_GET_LENGTH(to));
980 return -1;
981 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200982 if (how_many == 0)
983 return 0;
984
Victor Stinnercd9950f2011-10-02 00:34:53 +0200985 if (_PyUnicode_Dirty(to))
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200986 return -1;
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200987
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200988 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200989 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200990 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200991 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200992
Victor Stinnerf42dc442011-10-02 23:33:16 +0200993 if (from_kind == to_kind
994 /* deny latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +0200995 && !(!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200996 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200997 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200998 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +0200999 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001000 + PyUnicode_KIND_SIZE(from_kind, from_start),
1001 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001002 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001003 else if (from_kind == PyUnicode_1BYTE_KIND
1004 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001005 {
1006 _PyUnicode_CONVERT_BYTES(
1007 Py_UCS1, Py_UCS2,
1008 PyUnicode_1BYTE_DATA(from) + from_start,
1009 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1010 PyUnicode_2BYTE_DATA(to) + to_start
1011 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001012 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001013 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001014 && to_kind == PyUnicode_4BYTE_KIND)
1015 {
1016 _PyUnicode_CONVERT_BYTES(
1017 Py_UCS1, Py_UCS4,
1018 PyUnicode_1BYTE_DATA(from) + from_start,
1019 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1020 PyUnicode_4BYTE_DATA(to) + to_start
1021 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001022 }
1023 else if (from_kind == PyUnicode_2BYTE_KIND
1024 && to_kind == PyUnicode_4BYTE_KIND)
1025 {
1026 _PyUnicode_CONVERT_BYTES(
1027 Py_UCS2, Py_UCS4,
1028 PyUnicode_2BYTE_DATA(from) + from_start,
1029 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1030 PyUnicode_4BYTE_DATA(to) + to_start
1031 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001032 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001033 else {
1034 int invalid_kinds;
Victor Stinnerf42dc442011-10-02 23:33:16 +02001035
1036 /* check if max_char(from substring) <= max_char(to) */
1037 if (from_kind > to_kind
1038 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001039 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001040 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001041 /* slow path to check for character overflow */
1042 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1043 Py_UCS4 ch, maxchar;
1044 Py_ssize_t i;
1045
1046 maxchar = 0;
1047 invalid_kinds = 0;
1048 for (i=0; i < how_many; i++) {
1049 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1050 if (ch > maxchar) {
1051 maxchar = ch;
1052 if (maxchar > to_maxchar) {
1053 invalid_kinds = 1;
1054 break;
1055 }
1056 }
1057 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1058 }
1059 }
1060 else
1061 invalid_kinds = 1;
1062 if (invalid_kinds) {
Victor Stinner01698042011-10-04 00:04:26 +02001063 PyErr_Format(PyExc_SystemError,
Victor Stinnerf42dc442011-10-02 23:33:16 +02001064 "Cannot copy %s characters "
1065 "into a string of %s characters",
1066 unicode_kind_name(from),
1067 unicode_kind_name(to));
Victor Stinnera0702ab2011-09-29 14:14:38 +02001068 return -1;
1069 }
1070 }
1071 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001072}
1073
Victor Stinner17222162011-09-28 22:15:37 +02001074/* Find the maximum code point and count the number of surrogate pairs so a
1075 correct string length can be computed before converting a string to UCS4.
1076 This function counts single surrogates as a character and not as a pair.
1077
1078 Return 0 on success, or -1 on error. */
1079static int
1080find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1081 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001082{
1083 const wchar_t *iter;
1084
Victor Stinnerc53be962011-10-02 21:33:54 +02001085 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001086 *num_surrogates = 0;
1087 *maxchar = 0;
1088
1089 for (iter = begin; iter < end; ) {
Victor Stinnerae864852011-10-05 14:02:44 +02001090 if (*iter > *maxchar) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001091 *maxchar = *iter;
Victor Stinnerae864852011-10-05 14:02:44 +02001092#if SIZEOF_WCHAR_T != 2
1093 if (*maxchar >= 0x10000)
1094 return 0;
1095#endif
1096 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001097#if SIZEOF_WCHAR_T == 2
1098 if (*iter >= 0xD800 && *iter <= 0xDBFF
1099 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1100 {
1101 Py_UCS4 surrogate_val;
1102 surrogate_val = (((iter[0] & 0x3FF)<<10)
1103 | (iter[1] & 0x3FF)) + 0x10000;
1104 ++(*num_surrogates);
1105 if (surrogate_val > *maxchar)
1106 *maxchar = surrogate_val;
1107 iter += 2;
1108 }
1109 else
1110 iter++;
1111#else
1112 iter++;
1113#endif
1114 }
1115 return 0;
1116}
1117
1118#ifdef Py_DEBUG
1119int unicode_ready_calls = 0;
1120#endif
1121
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001122static int
1123unicode_ready(PyObject **p_obj, int replace)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001124{
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001125 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001126 wchar_t *end;
1127 Py_UCS4 maxchar = 0;
1128 Py_ssize_t num_surrogates;
1129#if SIZEOF_WCHAR_T == 2
1130 Py_ssize_t length_wo_surrogates;
1131#endif
1132
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001133 assert(p_obj != NULL);
1134 unicode = (PyUnicodeObject *)*p_obj;
1135
Georg Brandl7597add2011-10-05 16:36:47 +02001136 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001137 strings were created using _PyObject_New() and where no canonical
1138 representation (the str field) has been set yet aka strings
1139 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001140 assert(_PyUnicode_CHECK(unicode));
1141 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001142 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001143 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001144 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001145 /* Actually, it should neither be interned nor be anything else: */
1146 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001147
1148#ifdef Py_DEBUG
1149 ++unicode_ready_calls;
1150#endif
1151
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001152#ifdef Py_DEBUG
1153 assert(!replace || Py_REFCNT(unicode) == 1);
1154#else
1155 if (replace && Py_REFCNT(unicode) != 1)
1156 replace = 0;
1157#endif
1158 if (replace) {
1159 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
1160 wchar_t *wstr = _PyUnicode_WSTR(unicode);
1161 /* Optimization for empty strings */
1162 if (len == 0) {
1163 Py_INCREF(unicode_empty);
1164 Py_DECREF(*p_obj);
1165 *p_obj = unicode_empty;
1166 return 0;
1167 }
1168 if (len == 1 && wstr[0] < 256) {
1169 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
1170 if (latin1_char == NULL)
1171 return -1;
1172 Py_DECREF(*p_obj);
1173 *p_obj = latin1_char;
1174 return 0;
1175 }
1176 }
1177
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001178 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001179 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001180 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001181 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001182
1183 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001184 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1185 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001186 PyErr_NoMemory();
1187 return -1;
1188 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001189 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001190 _PyUnicode_WSTR(unicode), end,
1191 PyUnicode_1BYTE_DATA(unicode));
1192 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1193 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1194 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1195 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001196 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001197 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001198 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001199 }
1200 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001201 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001202 _PyUnicode_UTF8(unicode) = NULL;
1203 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001204 }
1205 PyObject_FREE(_PyUnicode_WSTR(unicode));
1206 _PyUnicode_WSTR(unicode) = NULL;
1207 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1208 }
1209 /* In this case we might have to convert down from 4-byte native
1210 wchar_t to 2-byte unicode. */
1211 else if (maxchar < 65536) {
1212 assert(num_surrogates == 0 &&
1213 "FindMaxCharAndNumSurrogatePairs() messed up");
1214
Victor Stinner506f5922011-09-28 22:34:18 +02001215#if SIZEOF_WCHAR_T == 2
1216 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001217 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001218 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1219 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1220 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001221 _PyUnicode_UTF8(unicode) = NULL;
1222 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001223#else
1224 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001225 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001226 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001227 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001228 PyErr_NoMemory();
1229 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001230 }
Victor Stinner506f5922011-09-28 22:34:18 +02001231 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1232 _PyUnicode_WSTR(unicode), end,
1233 PyUnicode_2BYTE_DATA(unicode));
1234 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1235 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1236 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001237 _PyUnicode_UTF8(unicode) = NULL;
1238 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001239 PyObject_FREE(_PyUnicode_WSTR(unicode));
1240 _PyUnicode_WSTR(unicode) = NULL;
1241 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1242#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001243 }
1244 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1245 else {
1246#if SIZEOF_WCHAR_T == 2
1247 /* in case the native representation is 2-bytes, we need to allocate a
1248 new normalized 4-byte version. */
1249 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001250 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1251 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001252 PyErr_NoMemory();
1253 return -1;
1254 }
1255 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1256 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001257 _PyUnicode_UTF8(unicode) = NULL;
1258 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001259 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1260 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001261 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001262 PyObject_FREE(_PyUnicode_WSTR(unicode));
1263 _PyUnicode_WSTR(unicode) = NULL;
1264 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1265#else
1266 assert(num_surrogates == 0);
1267
Victor Stinnerc3c74152011-10-02 20:39:55 +02001268 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001269 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001270 _PyUnicode_UTF8(unicode) = NULL;
1271 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001272 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1273#endif
1274 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1275 }
1276 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001277 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001278 return 0;
1279}
1280
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001281int
1282_PyUnicode_ReadyReplace(PyObject **op)
1283{
1284 return unicode_ready(op, 1);
1285}
1286
1287int
1288_PyUnicode_Ready(PyObject *op)
1289{
1290 return unicode_ready(&op, 0);
1291}
1292
Alexander Belopolsky40018472011-02-26 01:02:56 +00001293static void
1294unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001295{
Walter Dörwald16807132007-05-25 13:52:07 +00001296 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001297 case SSTATE_NOT_INTERNED:
1298 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001299
Benjamin Peterson29060642009-01-31 22:14:21 +00001300 case SSTATE_INTERNED_MORTAL:
1301 /* revive dead object temporarily for DelItem */
1302 Py_REFCNT(unicode) = 3;
1303 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1304 Py_FatalError(
1305 "deletion of interned string failed");
1306 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001307
Benjamin Peterson29060642009-01-31 22:14:21 +00001308 case SSTATE_INTERNED_IMMORTAL:
1309 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001310
Benjamin Peterson29060642009-01-31 22:14:21 +00001311 default:
1312 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001313 }
1314
Victor Stinner03490912011-10-03 23:45:12 +02001315 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001316 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001317 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001318 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001319
1320 if (PyUnicode_IS_COMPACT(unicode)) {
1321 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001322 }
1323 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001324 if (_PyUnicode_DATA_ANY(unicode))
1325 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001326 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001327 }
1328}
1329
Alexander Belopolsky40018472011-02-26 01:02:56 +00001330static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001331unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001332{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001333 if (Py_REFCNT(unicode) != 1)
1334 return 0;
1335 if (PyUnicode_CHECK_INTERNED(unicode))
1336 return 0;
Benjamin Peterson7f3140e2011-10-03 19:37:29 -04001337 assert(unicode != unicode_empty);
Victor Stinner77bb47b2011-10-03 20:06:05 +02001338#ifdef Py_DEBUG
1339 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND
1340 && PyUnicode_GET_LENGTH(unicode) == 1)
1341 {
1342 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001343 if (ch < 256 && unicode_latin1[ch] == unicode)
1344 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001345 }
Victor Stinner77bb47b2011-10-03 20:06:05 +02001346#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001347 return 1;
1348}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001349
Victor Stinnerfe226c02011-10-03 03:52:20 +02001350static int
1351unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1352{
1353 PyObject *unicode;
1354 Py_ssize_t old_length;
1355
1356 assert(p_unicode != NULL);
1357 unicode = *p_unicode;
1358
1359 assert(unicode != NULL);
1360 assert(PyUnicode_Check(unicode));
1361 assert(0 <= length);
1362
Victor Stinner910337b2011-10-03 03:20:16 +02001363 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001364 old_length = PyUnicode_WSTR_LENGTH(unicode);
1365 else
1366 old_length = PyUnicode_GET_LENGTH(unicode);
1367 if (old_length == length)
1368 return 0;
1369
Victor Stinnerfe226c02011-10-03 03:52:20 +02001370 if (!unicode_resizable(unicode)) {
1371 PyObject *copy = resize_copy(unicode, length);
1372 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001373 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001374 Py_DECREF(*p_unicode);
1375 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001376 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001377 }
1378
Victor Stinnerfe226c02011-10-03 03:52:20 +02001379 if (PyUnicode_IS_COMPACT(unicode)) {
1380 *p_unicode = resize_compact(unicode, length);
1381 if (*p_unicode == NULL)
1382 return -1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001383 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001384 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001385 }
1386 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001387}
1388
Alexander Belopolsky40018472011-02-26 01:02:56 +00001389int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001390PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001391{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001392 PyObject *unicode;
1393 if (p_unicode == NULL) {
1394 PyErr_BadInternalCall();
1395 return -1;
1396 }
1397 unicode = *p_unicode;
1398 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1399 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1400 {
1401 PyErr_BadInternalCall();
1402 return -1;
1403 }
1404 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001405}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001406
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001407static PyObject*
1408get_latin1_char(unsigned char ch)
1409{
Victor Stinnera464fc12011-10-02 20:39:30 +02001410 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001411 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001412 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001413 if (!unicode)
1414 return NULL;
1415 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001416 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001417 unicode_latin1[ch] = unicode;
1418 }
1419 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001420 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001421}
1422
Alexander Belopolsky40018472011-02-26 01:02:56 +00001423PyObject *
1424PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001425{
1426 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001427 Py_UCS4 maxchar = 0;
1428 Py_ssize_t num_surrogates;
1429
1430 if (u == NULL)
1431 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001432
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001433 /* If the Unicode data is known at construction time, we can apply
1434 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001435
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001436 /* Optimization for empty strings */
1437 if (size == 0 && unicode_empty != NULL) {
1438 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001439 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001440 }
Tim Petersced69f82003-09-16 20:30:58 +00001441
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001442 /* Single character Unicode objects in the Latin-1 range are
1443 shared when using this constructor */
1444 if (size == 1 && *u < 256)
1445 return get_latin1_char((unsigned char)*u);
1446
1447 /* If not empty and not single character, copy the Unicode data
1448 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001449 if (find_maxchar_surrogates(u, u + size,
1450 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001451 return NULL;
1452
1453 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1454 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001455 if (!unicode)
1456 return NULL;
1457
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001458 switch (PyUnicode_KIND(unicode)) {
1459 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001460 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001461 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1462 break;
1463 case PyUnicode_2BYTE_KIND:
1464#if Py_UNICODE_SIZE == 2
1465 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1466#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001467 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001468 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1469#endif
1470 break;
1471 case PyUnicode_4BYTE_KIND:
1472#if SIZEOF_WCHAR_T == 2
1473 /* This is the only case which has to process surrogates, thus
1474 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001475 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001476#else
1477 assert(num_surrogates == 0);
1478 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1479#endif
1480 break;
1481 default:
1482 assert(0 && "Impossible state");
1483 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001484
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001485 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001486 return (PyObject *)unicode;
1487}
1488
Alexander Belopolsky40018472011-02-26 01:02:56 +00001489PyObject *
1490PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001491{
1492 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001493
Benjamin Peterson14339b62009-01-31 16:36:08 +00001494 if (size < 0) {
1495 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001496 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001497 return NULL;
1498 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001499
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001500 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001501 some optimizations which share commonly used objects.
1502 Also, this means the input must be UTF-8, so fall back to the
1503 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001504 if (u != NULL) {
1505
Benjamin Peterson29060642009-01-31 22:14:21 +00001506 /* Optimization for empty strings */
1507 if (size == 0 && unicode_empty != NULL) {
1508 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001509 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001510 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001511
1512 /* Single characters are shared when using this constructor.
1513 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001514 if (size == 1 && Py_CHARMASK(*u) < 128)
1515 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001516
1517 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001518 }
1519
Walter Dörwald55507312007-05-18 13:12:10 +00001520 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001521 if (!unicode)
1522 return NULL;
1523
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001524 return (PyObject *)unicode;
1525}
1526
Alexander Belopolsky40018472011-02-26 01:02:56 +00001527PyObject *
1528PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001529{
1530 size_t size = strlen(u);
1531 if (size > PY_SSIZE_T_MAX) {
1532 PyErr_SetString(PyExc_OverflowError, "input too long");
1533 return NULL;
1534 }
1535
1536 return PyUnicode_FromStringAndSize(u, size);
1537}
1538
Victor Stinnere57b1c02011-09-28 22:20:48 +02001539static PyObject*
Victor Stinner702c7342011-10-05 13:50:52 +02001540unicode_fromascii(const unsigned char* u, Py_ssize_t size)
1541{
1542 PyObject *res = PyUnicode_New(size, 127);
1543 if (!res)
1544 return NULL;
1545 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1546 return res;
1547}
1548
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001549static Py_UCS4
1550kind_maxchar_limit(unsigned int kind)
1551{
1552 switch(kind) {
1553 case PyUnicode_1BYTE_KIND:
1554 return 0x80;
1555 case PyUnicode_2BYTE_KIND:
1556 return 0x100;
1557 case PyUnicode_4BYTE_KIND:
1558 return 0x10000;
1559 default:
1560 assert(0 && "invalid kind");
1561 return 0x10ffff;
1562 }
1563}
1564
Victor Stinner702c7342011-10-05 13:50:52 +02001565static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001566_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001567{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001568 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001569 unsigned char max_char = 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001570 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001571
1572 assert(size >= 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001573 for (i = 0; i < size; i++) {
1574 if (u[i] & 0x80) {
Victor Stinnerb9275c12011-10-05 14:01:42 +02001575 max_char = 255;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001576 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001577 }
1578 }
Victor Stinnerb9275c12011-10-05 14:01:42 +02001579 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001580 if (!res)
1581 return NULL;
1582 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001583 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001584 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001585}
1586
Victor Stinnere57b1c02011-09-28 22:20:48 +02001587static PyObject*
1588_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001589{
1590 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001591 Py_UCS2 max_char = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001592 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001593
1594 assert(size >= 0);
1595 for (i = 0; i < size; i++) {
1596 if (u[i] > max_char) {
1597 max_char = u[i];
1598 if (max_char >= 256)
1599 break;
1600 }
1601 }
1602 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001603 if (!res)
1604 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001605 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001606 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1607 else
1608 for (i = 0; i < size; i++)
1609 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001610 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001611 return res;
1612}
1613
Victor Stinnere57b1c02011-09-28 22:20:48 +02001614static PyObject*
1615_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001616{
1617 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001618 Py_UCS4 max_char = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001619 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001620
1621 assert(size >= 0);
1622 for (i = 0; i < size; i++) {
1623 if (u[i] > max_char) {
1624 max_char = u[i];
1625 if (max_char >= 0x10000)
1626 break;
1627 }
1628 }
1629 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001630 if (!res)
1631 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001632 if (max_char >= 0x10000)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001633 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1634 else {
1635 int kind = PyUnicode_KIND(res);
1636 void *data = PyUnicode_DATA(res);
1637 for (i = 0; i < size; i++)
1638 PyUnicode_WRITE(kind, data, i, u[i]);
1639 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001640 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001641 return res;
1642}
1643
1644PyObject*
1645PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1646{
1647 switch(kind) {
1648 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001649 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001650 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001651 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001652 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001653 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001654 default:
1655 assert(0 && "invalid kind");
1656 PyErr_SetString(PyExc_SystemError, "invalid kind");
1657 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001658 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001659}
1660
Victor Stinner034f6cf2011-09-30 02:26:44 +02001661PyObject*
1662PyUnicode_Copy(PyObject *unicode)
1663{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001664 Py_ssize_t size;
1665 PyObject *copy;
1666 void *data;
1667
Victor Stinner034f6cf2011-09-30 02:26:44 +02001668 if (!PyUnicode_Check(unicode)) {
1669 PyErr_BadInternalCall();
1670 return NULL;
1671 }
1672 if (PyUnicode_READY(unicode))
1673 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001674
1675 size = PyUnicode_GET_LENGTH(unicode);
1676 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1677 if (!copy)
1678 return NULL;
1679 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1680
1681 data = PyUnicode_DATA(unicode);
1682 switch (PyUnicode_KIND(unicode))
1683 {
1684 case PyUnicode_1BYTE_KIND:
1685 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1686 break;
1687 case PyUnicode_2BYTE_KIND:
1688 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1689 break;
1690 case PyUnicode_4BYTE_KIND:
1691 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1692 break;
1693 default:
1694 assert(0);
1695 break;
1696 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001697 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001698 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001699}
1700
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001701
Victor Stinnerbc603d12011-10-02 01:00:40 +02001702/* Widen Unicode objects to larger buffers. Don't write terminating null
1703 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001704
1705void*
1706_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1707{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001708 Py_ssize_t len;
1709 void *result;
1710 unsigned int skind;
1711
1712 if (PyUnicode_READY(s))
1713 return NULL;
1714
1715 len = PyUnicode_GET_LENGTH(s);
1716 skind = PyUnicode_KIND(s);
1717 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001718 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001719 return NULL;
1720 }
1721 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001722 case PyUnicode_2BYTE_KIND:
1723 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1724 if (!result)
1725 return PyErr_NoMemory();
1726 assert(skind == PyUnicode_1BYTE_KIND);
1727 _PyUnicode_CONVERT_BYTES(
1728 Py_UCS1, Py_UCS2,
1729 PyUnicode_1BYTE_DATA(s),
1730 PyUnicode_1BYTE_DATA(s) + len,
1731 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001732 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001733 case PyUnicode_4BYTE_KIND:
1734 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1735 if (!result)
1736 return PyErr_NoMemory();
1737 if (skind == PyUnicode_2BYTE_KIND) {
1738 _PyUnicode_CONVERT_BYTES(
1739 Py_UCS2, Py_UCS4,
1740 PyUnicode_2BYTE_DATA(s),
1741 PyUnicode_2BYTE_DATA(s) + len,
1742 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001743 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001744 else {
1745 assert(skind == PyUnicode_1BYTE_KIND);
1746 _PyUnicode_CONVERT_BYTES(
1747 Py_UCS1, Py_UCS4,
1748 PyUnicode_1BYTE_DATA(s),
1749 PyUnicode_1BYTE_DATA(s) + len,
1750 result);
1751 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001752 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001753 default:
1754 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001755 }
Victor Stinner01698042011-10-04 00:04:26 +02001756 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001757 return NULL;
1758}
1759
1760static Py_UCS4*
1761as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1762 int copy_null)
1763{
1764 int kind;
1765 void *data;
1766 Py_ssize_t len, targetlen;
1767 if (PyUnicode_READY(string) == -1)
1768 return NULL;
1769 kind = PyUnicode_KIND(string);
1770 data = PyUnicode_DATA(string);
1771 len = PyUnicode_GET_LENGTH(string);
1772 targetlen = len;
1773 if (copy_null)
1774 targetlen++;
1775 if (!target) {
1776 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1777 PyErr_NoMemory();
1778 return NULL;
1779 }
1780 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1781 if (!target) {
1782 PyErr_NoMemory();
1783 return NULL;
1784 }
1785 }
1786 else {
1787 if (targetsize < targetlen) {
1788 PyErr_Format(PyExc_SystemError,
1789 "string is longer than the buffer");
1790 if (copy_null && 0 < targetsize)
1791 target[0] = 0;
1792 return NULL;
1793 }
1794 }
1795 if (kind != PyUnicode_4BYTE_KIND) {
1796 Py_ssize_t i;
1797 for (i = 0; i < len; i++)
1798 target[i] = PyUnicode_READ(kind, data, i);
1799 }
1800 else
1801 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1802 if (copy_null)
1803 target[len] = 0;
1804 return target;
1805}
1806
1807Py_UCS4*
1808PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1809 int copy_null)
1810{
1811 if (target == NULL || targetsize < 1) {
1812 PyErr_BadInternalCall();
1813 return NULL;
1814 }
1815 return as_ucs4(string, target, targetsize, copy_null);
1816}
1817
1818Py_UCS4*
1819PyUnicode_AsUCS4Copy(PyObject *string)
1820{
1821 return as_ucs4(string, NULL, 0, 1);
1822}
1823
1824#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00001825
Alexander Belopolsky40018472011-02-26 01:02:56 +00001826PyObject *
1827PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001828{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001829 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +00001830 if (size == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001831 return PyUnicode_New(0, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00001832 PyErr_BadInternalCall();
1833 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001834 }
1835
Martin v. Löwis790465f2008-04-05 20:41:37 +00001836 if (size == -1) {
1837 size = wcslen(w);
1838 }
1839
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001840 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001841}
1842
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001843#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001844
Walter Dörwald346737f2007-05-31 10:44:43 +00001845static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001846makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1847 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001848{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001849 *fmt++ = '%';
1850 if (width) {
1851 if (zeropad)
1852 *fmt++ = '0';
1853 fmt += sprintf(fmt, "%d", width);
1854 }
1855 if (precision)
1856 fmt += sprintf(fmt, ".%d", precision);
1857 if (longflag)
1858 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001859 else if (longlongflag) {
1860 /* longlongflag should only ever be nonzero on machines with
1861 HAVE_LONG_LONG defined */
1862#ifdef HAVE_LONG_LONG
1863 char *f = PY_FORMAT_LONG_LONG;
1864 while (*f)
1865 *fmt++ = *f++;
1866#else
1867 /* we shouldn't ever get here */
1868 assert(0);
1869 *fmt++ = 'l';
1870#endif
1871 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001872 else if (size_tflag) {
1873 char *f = PY_FORMAT_SIZE_T;
1874 while (*f)
1875 *fmt++ = *f++;
1876 }
1877 *fmt++ = c;
1878 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001879}
1880
/* Helper for PyUnicode_FromFormatV(): parse the part of a conversion
   specification that follows the '%' at `f`.

   Reads an optional width, an optional ".precision" and an optional
   length modifier ("l", "ll" when HAVE_LONG_LONG is defined, or "z").
   A length modifier is consumed only when it is directly followed by
   'd', 'u' or 'i'.  Every output pointer may be NULL, in which case that
   value is not stored.

   Returns a pointer to the conversion character.  For the malformed
   inputs described below the pointer is moved one character back
   instead. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* step over the '%' and parse the width.precision part,
       e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;

    /* length modifiers: %ld, %lu, %li, %lld, %llu, %lli */
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    /* ... and the size_t modifier: %zd, %zu, %zi */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        ++f;
    }

    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
1945
/* Upper bound on the number of characters produced by %ld: 21 characters
   are enough for a 64-bit integer written in decimal together with an
   optional sign. */
#define MAX_LONG_CHARS 21
/* Upper bound on the number of characters produced by %lld.
   A long long needs at most ceil(log10(256)*SIZEOF_LONG_LONG) decimal
   digits plus 1 character for the sign; 53/22 is used as an upper bound
   for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1953
Walter Dörwaldd2034312007-05-18 16:29:38 +00001954PyObject *
1955PyUnicode_FromFormatV(const char *format, va_list vargs)
1956{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001957 va_list count;
1958 Py_ssize_t callcount = 0;
1959 PyObject **callresults = NULL;
1960 PyObject **callresult = NULL;
1961 Py_ssize_t n = 0;
1962 int width = 0;
1963 int precision = 0;
1964 int zeropad;
1965 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001966 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001967 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001968 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001969 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1970 Py_UCS4 argmaxchar;
1971 Py_ssize_t numbersize = 0;
1972 char *numberresults = NULL;
1973 char *numberresult = NULL;
1974 Py_ssize_t i;
1975 int kind;
1976 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001977
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001978 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001979 /* step 1: count the number of %S/%R/%A/%s format specifications
1980 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1981 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001982 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02001983 * also estimate a upper bound for all the number formats in the string,
1984 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001985 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001986 for (f = format; *f; f++) {
1987 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001988 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001989 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1990 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1991 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1992 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001993
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001994 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001995#ifdef HAVE_LONG_LONG
1996 if (longlongflag) {
1997 if (width < MAX_LONG_LONG_CHARS)
1998 width = MAX_LONG_LONG_CHARS;
1999 }
2000 else
2001#endif
2002 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2003 including sign. Decimal takes the most space. This
2004 isn't enough for octal. If a width is specified we
2005 need more (which we allocate later). */
2006 if (width < MAX_LONG_CHARS)
2007 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002008
2009 /* account for the size + '\0' to separate numbers
2010 inside of the numberresults buffer */
2011 numbersize += (width + 1);
2012 }
2013 }
2014 else if ((unsigned char)*f > 127) {
2015 PyErr_Format(PyExc_ValueError,
2016 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2017 "string, got a non-ASCII byte: 0x%02x",
2018 (unsigned char)*f);
2019 return NULL;
2020 }
2021 }
2022 /* step 2: allocate memory for the results of
2023 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2024 if (callcount) {
2025 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2026 if (!callresults) {
2027 PyErr_NoMemory();
2028 return NULL;
2029 }
2030 callresult = callresults;
2031 }
2032 /* step 2.5: allocate memory for the results of formating numbers */
2033 if (numbersize) {
2034 numberresults = PyObject_Malloc(numbersize);
2035 if (!numberresults) {
2036 PyErr_NoMemory();
2037 goto fail;
2038 }
2039 numberresult = numberresults;
2040 }
2041
2042 /* step 3: format numbers and figure out how large a buffer we need */
2043 for (f = format; *f; f++) {
2044 if (*f == '%') {
2045 const char* p;
2046 int longflag;
2047 int longlongflag;
2048 int size_tflag;
2049 int numprinted;
2050
2051 p = f;
2052 zeropad = (f[1] == '0');
2053 f = parse_format_flags(f, &width, &precision,
2054 &longflag, &longlongflag, &size_tflag);
2055 switch (*f) {
2056 case 'c':
2057 {
2058 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002059 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002060 n++;
2061 break;
2062 }
2063 case '%':
2064 n++;
2065 break;
2066 case 'i':
2067 case 'd':
2068 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2069 width, precision, *f);
2070 if (longflag)
2071 numprinted = sprintf(numberresult, fmt,
2072 va_arg(count, long));
2073#ifdef HAVE_LONG_LONG
2074 else if (longlongflag)
2075 numprinted = sprintf(numberresult, fmt,
2076 va_arg(count, PY_LONG_LONG));
2077#endif
2078 else if (size_tflag)
2079 numprinted = sprintf(numberresult, fmt,
2080 va_arg(count, Py_ssize_t));
2081 else
2082 numprinted = sprintf(numberresult, fmt,
2083 va_arg(count, int));
2084 n += numprinted;
2085 /* advance by +1 to skip over the '\0' */
2086 numberresult += (numprinted + 1);
2087 assert(*(numberresult - 1) == '\0');
2088 assert(*(numberresult - 2) != '\0');
2089 assert(numprinted >= 0);
2090 assert(numberresult <= numberresults + numbersize);
2091 break;
2092 case 'u':
2093 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2094 width, precision, 'u');
2095 if (longflag)
2096 numprinted = sprintf(numberresult, fmt,
2097 va_arg(count, unsigned long));
2098#ifdef HAVE_LONG_LONG
2099 else if (longlongflag)
2100 numprinted = sprintf(numberresult, fmt,
2101 va_arg(count, unsigned PY_LONG_LONG));
2102#endif
2103 else if (size_tflag)
2104 numprinted = sprintf(numberresult, fmt,
2105 va_arg(count, size_t));
2106 else
2107 numprinted = sprintf(numberresult, fmt,
2108 va_arg(count, unsigned int));
2109 n += numprinted;
2110 numberresult += (numprinted + 1);
2111 assert(*(numberresult - 1) == '\0');
2112 assert(*(numberresult - 2) != '\0');
2113 assert(numprinted >= 0);
2114 assert(numberresult <= numberresults + numbersize);
2115 break;
2116 case 'x':
2117 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2118 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2119 n += numprinted;
2120 numberresult += (numprinted + 1);
2121 assert(*(numberresult - 1) == '\0');
2122 assert(*(numberresult - 2) != '\0');
2123 assert(numprinted >= 0);
2124 assert(numberresult <= numberresults + numbersize);
2125 break;
2126 case 'p':
2127 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2128 /* %p is ill-defined: ensure leading 0x. */
2129 if (numberresult[1] == 'X')
2130 numberresult[1] = 'x';
2131 else if (numberresult[1] != 'x') {
2132 memmove(numberresult + 2, numberresult,
2133 strlen(numberresult) + 1);
2134 numberresult[0] = '0';
2135 numberresult[1] = 'x';
2136 numprinted += 2;
2137 }
2138 n += numprinted;
2139 numberresult += (numprinted + 1);
2140 assert(*(numberresult - 1) == '\0');
2141 assert(*(numberresult - 2) != '\0');
2142 assert(numprinted >= 0);
2143 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002144 break;
2145 case 's':
2146 {
2147 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002148 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002149 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2150 if (!str)
2151 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002152 /* since PyUnicode_DecodeUTF8 returns already flexible
2153 unicode objects, there is no need to call ready on them */
2154 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002155 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002156 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002157 /* Remember the str and switch to the next slot */
2158 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002159 break;
2160 }
2161 case 'U':
2162 {
2163 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002164 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002165 if (PyUnicode_READY(obj) == -1)
2166 goto fail;
2167 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002168 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002169 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002170 break;
2171 }
2172 case 'V':
2173 {
2174 PyObject *obj = va_arg(count, PyObject *);
2175 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002176 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002177 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002178 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002179 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002180 if (PyUnicode_READY(obj) == -1)
2181 goto fail;
2182 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002183 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002184 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002185 *callresult++ = NULL;
2186 }
2187 else {
2188 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2189 if (!str_obj)
2190 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002191 if (PyUnicode_READY(str_obj)) {
2192 Py_DECREF(str_obj);
2193 goto fail;
2194 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002195 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002196 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002197 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002198 *callresult++ = str_obj;
2199 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002200 break;
2201 }
2202 case 'S':
2203 {
2204 PyObject *obj = va_arg(count, PyObject *);
2205 PyObject *str;
2206 assert(obj);
2207 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002208 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002209 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002210 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002211 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002212 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002213 /* Remember the str and switch to the next slot */
2214 *callresult++ = str;
2215 break;
2216 }
2217 case 'R':
2218 {
2219 PyObject *obj = va_arg(count, PyObject *);
2220 PyObject *repr;
2221 assert(obj);
2222 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002223 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002224 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002225 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002226 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002227 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002228 /* Remember the repr and switch to the next slot */
2229 *callresult++ = repr;
2230 break;
2231 }
2232 case 'A':
2233 {
2234 PyObject *obj = va_arg(count, PyObject *);
2235 PyObject *ascii;
2236 assert(obj);
2237 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002238 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002239 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002240 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002241 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002242 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002243 /* Remember the repr and switch to the next slot */
2244 *callresult++ = ascii;
2245 break;
2246 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002247 default:
2248 /* if we stumble upon an unknown
2249 formatting code, copy the rest of
2250 the format string to the output
2251 string. (we cannot just skip the
2252 code, since there's no way to know
2253 what's in the argument list) */
2254 n += strlen(p);
2255 goto expand;
2256 }
2257 } else
2258 n++;
2259 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002260 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002261 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002262 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002263 we don't have to resize the string.
2264 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002265 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002266 if (!string)
2267 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002268 kind = PyUnicode_KIND(string);
2269 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002270 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002271 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002272
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002273 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002274 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002275 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002276
2277 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002278 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2279 /* checking for == because the last argument could be a empty
2280 string, which causes i to point to end, the assert at the end of
2281 the loop */
2282 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002283
Benjamin Peterson14339b62009-01-31 16:36:08 +00002284 switch (*f) {
2285 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002286 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002287 const int ordinal = va_arg(vargs, int);
2288 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002289 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002290 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002291 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002292 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002293 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002294 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002295 case 'p':
2296 /* unused, since we already have the result */
2297 if (*f == 'p')
2298 (void) va_arg(vargs, void *);
2299 else
2300 (void) va_arg(vargs, int);
2301 /* extract the result from numberresults and append. */
2302 for (; *numberresult; ++i, ++numberresult)
2303 PyUnicode_WRITE(kind, data, i, *numberresult);
2304 /* skip over the separating '\0' */
2305 assert(*numberresult == '\0');
2306 numberresult++;
2307 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002308 break;
2309 case 's':
2310 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002311 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002312 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002313 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002314 size = PyUnicode_GET_LENGTH(*callresult);
2315 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002316 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2317 *callresult, 0,
2318 size) < 0)
2319 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002320 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002321 /* We're done with the unicode()/repr() => forget it */
2322 Py_DECREF(*callresult);
2323 /* switch to next unicode()/repr() result */
2324 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002325 break;
2326 }
2327 case 'U':
2328 {
2329 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002330 Py_ssize_t size;
2331 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2332 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002333 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2334 obj, 0,
2335 size) < 0)
2336 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002337 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002338 break;
2339 }
2340 case 'V':
2341 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002342 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002343 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002344 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002345 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002346 size = PyUnicode_GET_LENGTH(obj);
2347 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002348 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2349 obj, 0,
2350 size) < 0)
2351 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002352 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002353 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002354 size = PyUnicode_GET_LENGTH(*callresult);
2355 assert(PyUnicode_KIND(*callresult) <=
2356 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002357 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2358 *callresult,
2359 0, size) < 0)
2360 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002361 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002362 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002363 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002364 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002365 break;
2366 }
2367 case 'S':
2368 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002369 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002370 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002371 /* unused, since we already have the result */
2372 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002373 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002374 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2375 *callresult, 0,
2376 PyUnicode_GET_LENGTH(*callresult)) < 0)
2377 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002378 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002379 /* We're done with the unicode()/repr() => forget it */
2380 Py_DECREF(*callresult);
2381 /* switch to next unicode()/repr() result */
2382 ++callresult;
2383 break;
2384 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002385 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002386 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002387 break;
2388 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002389 for (; *p; ++p, ++i)
2390 PyUnicode_WRITE(kind, data, i, *p);
2391 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002392 goto end;
2393 }
Victor Stinner1205f272010-09-11 00:54:47 +00002394 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002395 else {
2396 assert(i < PyUnicode_GET_LENGTH(string));
2397 PyUnicode_WRITE(kind, data, i++, *f);
2398 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002399 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002400 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002401
Benjamin Peterson29060642009-01-31 22:14:21 +00002402 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002403 if (callresults)
2404 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002405 if (numberresults)
2406 PyObject_Free(numberresults);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002407 assert(_PyUnicode_CheckConsistency(string, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002408 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002409 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002410 if (callresults) {
2411 PyObject **callresult2 = callresults;
2412 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002413 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002414 ++callresult2;
2415 }
2416 PyObject_Free(callresults);
2417 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002418 if (numberresults)
2419 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002420 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002421}
2422
Walter Dörwaldd2034312007-05-18 16:29:38 +00002423PyObject *
2424PyUnicode_FromFormat(const char *format, ...)
2425{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002426 PyObject* ret;
2427 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002428
2429#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002430 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002431#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002432 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002433#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002434 ret = PyUnicode_FromFormatV(format, vargs);
2435 va_end(vargs);
2436 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002437}
2438
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002439#ifdef HAVE_WCHAR_H
2440
Victor Stinner5593d8a2010-10-02 11:11:27 +00002441/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2442 convert a Unicode object to a wide character string.
2443
Victor Stinnerd88d9832011-09-06 02:00:05 +02002444 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002445 character) required to convert the unicode object. Ignore size argument.
2446
Victor Stinnerd88d9832011-09-06 02:00:05 +02002447 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002448 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002449 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002450static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002451unicode_aswidechar(PyUnicodeObject *unicode,
2452 wchar_t *w,
2453 Py_ssize_t size)
2454{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002455 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002456 const wchar_t *wstr;
2457
2458 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2459 if (wstr == NULL)
2460 return -1;
2461
Victor Stinner5593d8a2010-10-02 11:11:27 +00002462 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002463 if (size > res)
2464 size = res + 1;
2465 else
2466 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002467 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002468 return res;
2469 }
2470 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002471 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002472}
2473
2474Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002475PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002476 wchar_t *w,
2477 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002478{
2479 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002480 PyErr_BadInternalCall();
2481 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002482 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002483 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002484}
2485
Victor Stinner137c34c2010-09-29 10:25:54 +00002486wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002487PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002488 Py_ssize_t *size)
2489{
2490 wchar_t* buffer;
2491 Py_ssize_t buflen;
2492
2493 if (unicode == NULL) {
2494 PyErr_BadInternalCall();
2495 return NULL;
2496 }
2497
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002498 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002499 if (buflen == -1)
2500 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002501 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002502 PyErr_NoMemory();
2503 return NULL;
2504 }
2505
Victor Stinner137c34c2010-09-29 10:25:54 +00002506 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2507 if (buffer == NULL) {
2508 PyErr_NoMemory();
2509 return NULL;
2510 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002511 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002512 if (buflen == -1)
2513 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002514 if (size != NULL)
2515 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002516 return buffer;
2517}
2518
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002519#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002520
Alexander Belopolsky40018472011-02-26 01:02:56 +00002521PyObject *
2522PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002523{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002524 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002525 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002526 PyErr_SetString(PyExc_ValueError,
2527 "chr() arg not in range(0x110000)");
2528 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002529 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002530
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002531 if (ordinal < 256)
2532 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002533
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002534 v = PyUnicode_New(1, ordinal);
2535 if (v == NULL)
2536 return NULL;
2537 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002538 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002539 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002540}
2541
Alexander Belopolsky40018472011-02-26 01:02:56 +00002542PyObject *
2543PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002544{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002545 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002546 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002547 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002548 if (PyUnicode_READY(obj))
2549 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002550 Py_INCREF(obj);
2551 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002552 }
2553 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002554 /* For a Unicode subtype that's not a Unicode object,
2555 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002556 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002557 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002558 PyErr_Format(PyExc_TypeError,
2559 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002560 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002561 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002562}
2563
Alexander Belopolsky40018472011-02-26 01:02:56 +00002564PyObject *
2565PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002566 const char *encoding,
2567 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002568{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002569 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002570 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002571
Guido van Rossumd57fd912000-03-10 22:53:23 +00002572 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002573 PyErr_BadInternalCall();
2574 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002575 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002576
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002577 /* Decoding bytes objects is the most common case and should be fast */
2578 if (PyBytes_Check(obj)) {
2579 if (PyBytes_GET_SIZE(obj) == 0) {
2580 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002581 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002582 }
2583 else {
2584 v = PyUnicode_Decode(
2585 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2586 encoding, errors);
2587 }
2588 return v;
2589 }
2590
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002591 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002592 PyErr_SetString(PyExc_TypeError,
2593 "decoding str is not supported");
2594 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002595 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002596
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002597 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2598 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2599 PyErr_Format(PyExc_TypeError,
2600 "coercing to str: need bytes, bytearray "
2601 "or buffer-like object, %.80s found",
2602 Py_TYPE(obj)->tp_name);
2603 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002604 }
Tim Petersced69f82003-09-16 20:30:58 +00002605
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002606 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002607 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002608 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002609 }
Tim Petersced69f82003-09-16 20:30:58 +00002610 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002611 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002612
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002613 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002614 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002615}
2616
/* Write a normalized copy of `encoding` into `lower`: upper-case letters
   are lowered and '_' becomes '-', so that spellings such as UTF_8 match
   the shortcut names.

   `lower_len` is the size of the `lower` buffer, terminator included.
   Returns 1 on success (and `lower` is null-terminated), 0 if `encoding`
   is longer than lower_len-1 characters; in that case `lower` is left
   without a terminator. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    /* Number of characters that fit in front of the terminating null. */
    const size_t capacity = lower_len - 1;
    size_t pos;

    for (pos = 0; encoding[pos] != '\0'; pos++) {
        const char ch = encoding[pos];

        if (pos == capacity)
            return 0;

        if (Py_ISUPPER(ch))
            lower[pos] = Py_TOLOWER(ch);
        else if (ch == '_')
            lower[pos] = '-';
        else
            lower[pos] = ch;
    }
    lower[pos] = '\0';
    return 1;
}
2649
Alexander Belopolsky40018472011-02-26 01:02:56 +00002650PyObject *
2651PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002652 Py_ssize_t size,
2653 const char *encoding,
2654 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002655{
2656 PyObject *buffer = NULL, *unicode;
2657 Py_buffer info;
2658 char lower[11]; /* Enough for any encoding shortcut */
2659
2660 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002661 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002662
2663 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002664 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002665 if ((strcmp(lower, "utf-8") == 0) ||
2666 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002667 return PyUnicode_DecodeUTF8(s, size, errors);
2668 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002669 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002670 (strcmp(lower, "iso-8859-1") == 0))
2671 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002672#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002673 else if (strcmp(lower, "mbcs") == 0)
2674 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002675#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002676 else if (strcmp(lower, "ascii") == 0)
2677 return PyUnicode_DecodeASCII(s, size, errors);
2678 else if (strcmp(lower, "utf-16") == 0)
2679 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2680 else if (strcmp(lower, "utf-32") == 0)
2681 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2682 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002683
2684 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002685 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002686 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002687 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002688 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002689 if (buffer == NULL)
2690 goto onError;
2691 unicode = PyCodec_Decode(buffer, encoding, errors);
2692 if (unicode == NULL)
2693 goto onError;
2694 if (!PyUnicode_Check(unicode)) {
2695 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002696 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002697 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002698 Py_DECREF(unicode);
2699 goto onError;
2700 }
2701 Py_DECREF(buffer);
Victor Stinner17efeed2011-10-04 20:05:46 +02002702#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02002703 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002704 Py_DECREF(unicode);
2705 return NULL;
2706 }
Victor Stinner17efeed2011-10-04 20:05:46 +02002707#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002708 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002709 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002710
Benjamin Peterson29060642009-01-31 22:14:21 +00002711 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002712 Py_XDECREF(buffer);
2713 return NULL;
2714}
2715
Alexander Belopolsky40018472011-02-26 01:02:56 +00002716PyObject *
2717PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002718 const char *encoding,
2719 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002720{
2721 PyObject *v;
2722
2723 if (!PyUnicode_Check(unicode)) {
2724 PyErr_BadArgument();
2725 goto onError;
2726 }
2727
2728 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002729 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002730
2731 /* Decode via the codec registry */
2732 v = PyCodec_Decode(unicode, encoding, errors);
2733 if (v == NULL)
2734 goto onError;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002735 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002736 return v;
2737
Benjamin Peterson29060642009-01-31 22:14:21 +00002738 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002739 return NULL;
2740}
2741
Alexander Belopolsky40018472011-02-26 01:02:56 +00002742PyObject *
2743PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002744 const char *encoding,
2745 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002746{
2747 PyObject *v;
2748
2749 if (!PyUnicode_Check(unicode)) {
2750 PyErr_BadArgument();
2751 goto onError;
2752 }
2753
2754 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002755 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002756
2757 /* Decode via the codec registry */
2758 v = PyCodec_Decode(unicode, encoding, errors);
2759 if (v == NULL)
2760 goto onError;
2761 if (!PyUnicode_Check(v)) {
2762 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002763 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002764 Py_TYPE(v)->tp_name);
2765 Py_DECREF(v);
2766 goto onError;
2767 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002768 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002769 return v;
2770
Benjamin Peterson29060642009-01-31 22:14:21 +00002771 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002772 return NULL;
2773}
2774
Alexander Belopolsky40018472011-02-26 01:02:56 +00002775PyObject *
2776PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002777 Py_ssize_t size,
2778 const char *encoding,
2779 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002780{
2781 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002782
Guido van Rossumd57fd912000-03-10 22:53:23 +00002783 unicode = PyUnicode_FromUnicode(s, size);
2784 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002785 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002786 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2787 Py_DECREF(unicode);
2788 return v;
2789}
2790
Alexander Belopolsky40018472011-02-26 01:02:56 +00002791PyObject *
2792PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002793 const char *encoding,
2794 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002795{
2796 PyObject *v;
2797
2798 if (!PyUnicode_Check(unicode)) {
2799 PyErr_BadArgument();
2800 goto onError;
2801 }
2802
2803 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002804 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002805
2806 /* Encode via the codec registry */
2807 v = PyCodec_Encode(unicode, encoding, errors);
2808 if (v == NULL)
2809 goto onError;
2810 return v;
2811
Benjamin Peterson29060642009-01-31 22:14:21 +00002812 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002813 return NULL;
2814}
2815
Victor Stinnerad158722010-10-27 00:25:46 +00002816PyObject *
2817PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002818{
Victor Stinner99b95382011-07-04 14:23:54 +02002819#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002820 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2821 PyUnicode_GET_SIZE(unicode),
2822 NULL);
2823#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002824 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002825#else
Victor Stinner793b5312011-04-27 00:24:21 +02002826 PyInterpreterState *interp = PyThreadState_GET()->interp;
2827 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2828 cannot use it to encode and decode filenames before it is loaded. Load
2829 the Python codec requires to encode at least its own filename. Use the C
2830 version of the locale codec until the codec registry is initialized and
2831 the Python codec is loaded.
2832
2833 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2834 cannot only rely on it: check also interp->fscodec_initialized for
2835 subinterpreters. */
2836 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002837 return PyUnicode_AsEncodedString(unicode,
2838 Py_FileSystemDefaultEncoding,
2839 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002840 }
2841 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002842 /* locale encoding with surrogateescape */
2843 wchar_t *wchar;
2844 char *bytes;
2845 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002846 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002847
2848 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2849 if (wchar == NULL)
2850 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002851 bytes = _Py_wchar2char(wchar, &error_pos);
2852 if (bytes == NULL) {
2853 if (error_pos != (size_t)-1) {
2854 char *errmsg = strerror(errno);
2855 PyObject *exc = NULL;
2856 if (errmsg == NULL)
2857 errmsg = "Py_wchar2char() failed";
2858 raise_encode_exception(&exc,
2859 "filesystemencoding",
2860 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2861 error_pos, error_pos+1,
2862 errmsg);
2863 Py_XDECREF(exc);
2864 }
2865 else
2866 PyErr_NoMemory();
2867 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002868 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002869 }
2870 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002871
2872 bytes_obj = PyBytes_FromString(bytes);
2873 PyMem_Free(bytes);
2874 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002875 }
Victor Stinnerad158722010-10-27 00:25:46 +00002876#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002877}
2878
Alexander Belopolsky40018472011-02-26 01:02:56 +00002879PyObject *
2880PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002881 const char *encoding,
2882 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002883{
2884 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002885 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002886
Guido van Rossumd57fd912000-03-10 22:53:23 +00002887 if (!PyUnicode_Check(unicode)) {
2888 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002889 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002890 }
Fred Drakee4315f52000-05-09 19:53:39 +00002891
Victor Stinner2f283c22011-03-02 01:21:46 +00002892 if (encoding == NULL) {
2893 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002894 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002895 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002896 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002897 }
Fred Drakee4315f52000-05-09 19:53:39 +00002898
2899 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002900 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002901 if ((strcmp(lower, "utf-8") == 0) ||
2902 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002903 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002904 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002905 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002906 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002907 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002908 }
Victor Stinner37296e82010-06-10 13:36:23 +00002909 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002910 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002911 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002912 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002913#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002914 else if (strcmp(lower, "mbcs") == 0)
2915 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2916 PyUnicode_GET_SIZE(unicode),
2917 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002918#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002919 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002920 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002921 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002922
2923 /* Encode via the codec registry */
2924 v = PyCodec_Encode(unicode, encoding, errors);
2925 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002926 return NULL;
2927
2928 /* The normal path */
2929 if (PyBytes_Check(v))
2930 return v;
2931
2932 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002933 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002934 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002935 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002936
2937 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2938 "encoder %s returned bytearray instead of bytes",
2939 encoding);
2940 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002941 Py_DECREF(v);
2942 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002943 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002944
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002945 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2946 Py_DECREF(v);
2947 return b;
2948 }
2949
2950 PyErr_Format(PyExc_TypeError,
2951 "encoder did not return a bytes object (type=%.400s)",
2952 Py_TYPE(v)->tp_name);
2953 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002954 return NULL;
2955}
2956
Alexander Belopolsky40018472011-02-26 01:02:56 +00002957PyObject *
2958PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002959 const char *encoding,
2960 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002961{
2962 PyObject *v;
2963
2964 if (!PyUnicode_Check(unicode)) {
2965 PyErr_BadArgument();
2966 goto onError;
2967 }
2968
2969 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002970 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002971
2972 /* Encode via the codec registry */
2973 v = PyCodec_Encode(unicode, encoding, errors);
2974 if (v == NULL)
2975 goto onError;
2976 if (!PyUnicode_Check(v)) {
2977 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002978 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002979 Py_TYPE(v)->tp_name);
2980 Py_DECREF(v);
2981 goto onError;
2982 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002983 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002984
Benjamin Peterson29060642009-01-31 22:14:21 +00002985 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002986 return NULL;
2987}
2988
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002989PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002990PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002991 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002992 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2993}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002994
Christian Heimes5894ba72007-11-04 11:43:14 +00002995PyObject*
2996PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2997{
Victor Stinner99b95382011-07-04 14:23:54 +02002998#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002999 return PyUnicode_DecodeMBCS(s, size, NULL);
3000#elif defined(__APPLE__)
3001 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
3002#else
Victor Stinner793b5312011-04-27 00:24:21 +02003003 PyInterpreterState *interp = PyThreadState_GET()->interp;
3004 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3005 cannot use it to encode and decode filenames before it is loaded. Load
3006 the Python codec requires to encode at least its own filename. Use the C
3007 version of the locale codec until the codec registry is initialized and
3008 the Python codec is loaded.
3009
3010 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3011 cannot only rely on it: check also interp->fscodec_initialized for
3012 subinterpreters. */
3013 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003014 return PyUnicode_Decode(s, size,
3015 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003016 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003017 }
3018 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003019 /* locale encoding with surrogateescape */
3020 wchar_t *wchar;
3021 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00003022 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003023
3024 if (s[size] != '\0' || size != strlen(s)) {
3025 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3026 return NULL;
3027 }
3028
Victor Stinner168e1172010-10-16 23:16:16 +00003029 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003030 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003031 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003032
Victor Stinner168e1172010-10-16 23:16:16 +00003033 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003034 PyMem_Free(wchar);
3035 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003036 }
Victor Stinnerad158722010-10-27 00:25:46 +00003037#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003038}
3039
Martin v. Löwis011e8422009-05-05 04:43:17 +00003040
3041int
3042PyUnicode_FSConverter(PyObject* arg, void* addr)
3043{
3044 PyObject *output = NULL;
3045 Py_ssize_t size;
3046 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003047 if (arg == NULL) {
3048 Py_DECREF(*(PyObject**)addr);
3049 return 1;
3050 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003051 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003052 output = arg;
3053 Py_INCREF(output);
3054 }
3055 else {
3056 arg = PyUnicode_FromObject(arg);
3057 if (!arg)
3058 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003059 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003060 Py_DECREF(arg);
3061 if (!output)
3062 return 0;
3063 if (!PyBytes_Check(output)) {
3064 Py_DECREF(output);
3065 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3066 return 0;
3067 }
3068 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003069 size = PyBytes_GET_SIZE(output);
3070 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003071 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003072 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003073 Py_DECREF(output);
3074 return 0;
3075 }
3076 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003077 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003078}
3079
3080
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003081int
3082PyUnicode_FSDecoder(PyObject* arg, void* addr)
3083{
3084 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003085 if (arg == NULL) {
3086 Py_DECREF(*(PyObject**)addr);
3087 return 1;
3088 }
3089 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003090 if (PyUnicode_READY(arg))
3091 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003092 output = arg;
3093 Py_INCREF(output);
3094 }
3095 else {
3096 arg = PyBytes_FromObject(arg);
3097 if (!arg)
3098 return 0;
3099 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3100 PyBytes_GET_SIZE(arg));
3101 Py_DECREF(arg);
3102 if (!output)
3103 return 0;
3104 if (!PyUnicode_Check(output)) {
3105 Py_DECREF(output);
3106 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3107 return 0;
3108 }
3109 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003110 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3111 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003112 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3113 Py_DECREF(output);
3114 return 0;
3115 }
3116 *(PyObject**)addr = output;
3117 return Py_CLEANUP_SUPPORTED;
3118}
3119
3120
Martin v. Löwis5b222132007-06-10 09:51:05 +00003121char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003122PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003123{
Christian Heimesf3863112007-11-22 07:46:41 +00003124 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003125 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
3126
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003127 if (!PyUnicode_Check(unicode)) {
3128 PyErr_BadArgument();
3129 return NULL;
3130 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003131 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003132 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003133
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003134 if (PyUnicode_UTF8(unicode) == NULL) {
3135 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003136 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3137 if (bytes == NULL)
3138 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003139 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3140 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003141 Py_DECREF(bytes);
3142 return NULL;
3143 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003144 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
3145 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003146 Py_DECREF(bytes);
3147 }
3148
3149 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003150 *psize = PyUnicode_UTF8_LENGTH(unicode);
3151 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003152}
3153
3154char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003155PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003156{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003157 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3158}
3159
3160#ifdef Py_DEBUG
3161int unicode_as_unicode_calls = 0;
3162#endif
3163
3164
3165Py_UNICODE *
3166PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3167{
3168 PyUnicodeObject *u;
3169 const unsigned char *one_byte;
3170#if SIZEOF_WCHAR_T == 4
3171 const Py_UCS2 *two_bytes;
3172#else
3173 const Py_UCS4 *four_bytes;
3174 const Py_UCS4 *ucs4_end;
3175 Py_ssize_t num_surrogates;
3176#endif
3177 wchar_t *w;
3178 wchar_t *wchar_end;
3179
3180 if (!PyUnicode_Check(unicode)) {
3181 PyErr_BadArgument();
3182 return NULL;
3183 }
3184 u = (PyUnicodeObject*)unicode;
3185 if (_PyUnicode_WSTR(u) == NULL) {
3186 /* Non-ASCII compact unicode object */
3187 assert(_PyUnicode_KIND(u) != 0);
3188 assert(PyUnicode_IS_READY(u));
3189
3190#ifdef Py_DEBUG
3191 ++unicode_as_unicode_calls;
3192#endif
3193
3194 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
3195#if SIZEOF_WCHAR_T == 2
3196 four_bytes = PyUnicode_4BYTE_DATA(u);
3197 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
3198 num_surrogates = 0;
3199
3200 for (; four_bytes < ucs4_end; ++four_bytes) {
3201 if (*four_bytes > 0xFFFF)
3202 ++num_surrogates;
3203 }
3204
3205 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3206 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3207 if (!_PyUnicode_WSTR(u)) {
3208 PyErr_NoMemory();
3209 return NULL;
3210 }
3211 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3212
3213 w = _PyUnicode_WSTR(u);
3214 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3215 four_bytes = PyUnicode_4BYTE_DATA(u);
3216 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3217 if (*four_bytes > 0xFFFF) {
3218 /* encode surrogate pair in this case */
3219 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3220 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3221 }
3222 else
3223 *w = *four_bytes;
3224
3225 if (w > wchar_end) {
3226 assert(0 && "Miscalculated string end");
3227 }
3228 }
3229 *w = 0;
3230#else
3231 /* sizeof(wchar_t) == 4 */
3232 Py_FatalError("Impossible unicode object state, wstr and str "
3233 "should share memory already.");
3234 return NULL;
3235#endif
3236 }
3237 else {
3238 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3239 (_PyUnicode_LENGTH(u) + 1));
3240 if (!_PyUnicode_WSTR(u)) {
3241 PyErr_NoMemory();
3242 return NULL;
3243 }
3244 if (!PyUnicode_IS_COMPACT_ASCII(u))
3245 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3246 w = _PyUnicode_WSTR(u);
3247 wchar_end = w + _PyUnicode_LENGTH(u);
3248
3249 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3250 one_byte = PyUnicode_1BYTE_DATA(u);
3251 for (; w < wchar_end; ++one_byte, ++w)
3252 *w = *one_byte;
3253 /* null-terminate the wstr */
3254 *w = 0;
3255 }
3256 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3257#if SIZEOF_WCHAR_T == 4
3258 two_bytes = PyUnicode_2BYTE_DATA(u);
3259 for (; w < wchar_end; ++two_bytes, ++w)
3260 *w = *two_bytes;
3261 /* null-terminate the wstr */
3262 *w = 0;
3263#else
3264 /* sizeof(wchar_t) == 2 */
3265 PyObject_FREE(_PyUnicode_WSTR(u));
3266 _PyUnicode_WSTR(u) = NULL;
3267 Py_FatalError("Impossible unicode object state, wstr "
3268 "and str should share memory already.");
3269 return NULL;
3270#endif
3271 }
3272 else {
3273 assert(0 && "This should never happen.");
3274 }
3275 }
3276 }
3277 if (size != NULL)
3278 *size = PyUnicode_WSTR_LENGTH(u);
3279 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003280}
3281
Alexander Belopolsky40018472011-02-26 01:02:56 +00003282Py_UNICODE *
3283PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003284{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003285 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003286}
3287
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003288
Alexander Belopolsky40018472011-02-26 01:02:56 +00003289Py_ssize_t
3290PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003291{
3292 if (!PyUnicode_Check(unicode)) {
3293 PyErr_BadArgument();
3294 goto onError;
3295 }
3296 return PyUnicode_GET_SIZE(unicode);
3297
Benjamin Peterson29060642009-01-31 22:14:21 +00003298 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003299 return -1;
3300}
3301
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003302Py_ssize_t
3303PyUnicode_GetLength(PyObject *unicode)
3304{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003305 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003306 PyErr_BadArgument();
3307 return -1;
3308 }
3309
3310 return PyUnicode_GET_LENGTH(unicode);
3311}
3312
3313Py_UCS4
3314PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3315{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003316 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3317 PyErr_BadArgument();
3318 return (Py_UCS4)-1;
3319 }
3320 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3321 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003322 return (Py_UCS4)-1;
3323 }
3324 return PyUnicode_READ_CHAR(unicode, index);
3325}
3326
3327int
3328PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3329{
3330 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003331 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003332 return -1;
3333 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003334 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3335 PyErr_SetString(PyExc_IndexError, "string index out of range");
3336 return -1;
3337 }
3338 if (_PyUnicode_Dirty(unicode))
3339 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003340 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3341 index, ch);
3342 return 0;
3343}
3344
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3350
Victor Stinner554f3f02010-06-16 23:33:54 +00003351/* create or adjust a UnicodeDecodeError */
3352static void
3353make_decode_exception(PyObject **exceptionObject,
3354 const char *encoding,
3355 const char *input, Py_ssize_t length,
3356 Py_ssize_t startpos, Py_ssize_t endpos,
3357 const char *reason)
3358{
3359 if (*exceptionObject == NULL) {
3360 *exceptionObject = PyUnicodeDecodeError_Create(
3361 encoding, input, length, startpos, endpos, reason);
3362 }
3363 else {
3364 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3365 goto onError;
3366 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3367 goto onError;
3368 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3369 goto onError;
3370 }
3371 return;
3372
3373onError:
3374 Py_DECREF(*exceptionObject);
3375 *exceptionObject = NULL;
3376}
3377
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003378/* error handling callback helper:
3379 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003380 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003381 and adjust various state variables.
3382 return 0 on success, -1 on error
3383*/
3384
Alexander Belopolsky40018472011-02-26 01:02:56 +00003385static int
3386unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003387 const char *encoding, const char *reason,
3388 const char **input, const char **inend, Py_ssize_t *startinpos,
3389 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3390 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003391{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003392 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003393
3394 PyObject *restuple = NULL;
3395 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003396 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003397 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003398 Py_ssize_t requiredsize;
3399 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003400 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003401 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003402 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003403 int res = -1;
3404
3405 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003406 *errorHandler = PyCodec_LookupError(errors);
3407 if (*errorHandler == NULL)
3408 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003409 }
3410
Victor Stinner554f3f02010-06-16 23:33:54 +00003411 make_decode_exception(exceptionObject,
3412 encoding,
3413 *input, *inend - *input,
3414 *startinpos, *endinpos,
3415 reason);
3416 if (*exceptionObject == NULL)
3417 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003418
3419 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3420 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003421 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003422 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003423 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003424 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003425 }
3426 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003427 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003428
3429 /* Copy back the bytes variables, which might have been modified by the
3430 callback */
3431 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3432 if (!inputobj)
3433 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003434 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003435 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003436 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003437 *input = PyBytes_AS_STRING(inputobj);
3438 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003439 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003440 /* we can DECREF safely, as the exception has another reference,
3441 so the object won't go away. */
3442 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003443
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003444 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003445 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003446 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003447 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3448 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003449 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003450
3451 /* need more space? (at least enough for what we
3452 have+the replacement+the rest of the string (starting
3453 at the new input position), so we won't have to check space
3454 when there are no errors in the rest of the string) */
3455 repptr = PyUnicode_AS_UNICODE(repunicode);
3456 repsize = PyUnicode_GET_SIZE(repunicode);
3457 requiredsize = *outpos + repsize + insize-newpos;
3458 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003459 if (requiredsize<2*outsize)
3460 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003461 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003462 goto onError;
3463 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003464 }
3465 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003466 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003467 Py_UNICODE_COPY(*outptr, repptr, repsize);
3468 *outptr += repsize;
3469 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003470
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003471 /* we made it! */
3472 res = 0;
3473
Benjamin Peterson29060642009-01-31 22:14:21 +00003474 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003475 Py_XDECREF(restuple);
3476 return res;
3477}
3478
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003479/* --- UTF-7 Codec -------------------------------------------------------- */
3480
Antoine Pitrou244651a2009-05-04 18:56:13 +00003481/* See RFC2152 for details. We encode conservatively and decode liberally. */
3482
/* Base-64 helpers for the UTF-7 codec.  Every macro below may evaluate
   its argument more than once. */

/* IS_BASE64: is c one of A-Z, a-z, 0-9, '+' or '/'? */

#define IS_BASE64(c)                          \
    (((c) >= 'A' && (c) <= 'Z') ||            \
     ((c) >= 'a' && (c) <= 'z') ||            \
     ((c) >= '0' && (c) <= '9') ||            \
     (c) == '+' ||                            \
     (c) == '/')

/* FROM_BASE64: the 6-bit value (0..63) of the base-64 character c.
   Only meaningful when IS_BASE64(c) holds; any other input yields 63. */

#define FROM_BASE64(c)                                   \
    (((c) >= 'A' && (c) <= 'Z') ? ((c) - 'A') :          \
     ((c) >= 'a' && (c) <= 'z') ? ((c) - 'a' + 26) :     \
     ((c) >= '0' && (c) <= '9') ? ((c) - '0' + 52) :     \
     ((c) == '+') ? 62 : 63)

/* TO_BASE64: the base-64 character for the bottom 6 bits of n. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: should this byte, met in a UTF-7 string outside a
 * base-64 section, be decoded as itself?  Decoding is permissive: every
 * ASCII byte stands for itself except '+', which opens a base-64
 * section. */

#define DECODE_DIRECT(c)  \
    ((c) <= 127 && (c) != '+')
3513
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        -- character code; values outside 1..127 are never direct.
 * directO  -- non-zero to encode Set O characters as themselves.
 * directWS -- non-zero to encode whitespace as itself.
 *
 * All three arguments are parenthesized in the expansion so that
 * compound expressions can be passed safely; c is evaluated several
 * times. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003559
Alexander Belopolsky40018472011-02-26 01:02:56 +00003560PyObject *
3561PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003562 Py_ssize_t size,
3563 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003564{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003565 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3566}
3567
Antoine Pitrou244651a2009-05-04 18:56:13 +00003568/* The decoder. The only state we preserve is our read position,
3569 * i.e. how many characters we have consumed. So if we end in the
3570 * middle of a shift sequence we have to back off the read position
3571 * and the output to the beginning of the sequence, otherwise we lose
3572 * all the shift state (seen bits, number of bits seen, high
3573 * surrogate). */
3574
Alexander Belopolsky40018472011-02-26 01:02:56 +00003575PyObject *
3576PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003577 Py_ssize_t size,
3578 const char *errors,
3579 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003580{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003581 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003582 Py_ssize_t startinpos;
3583 Py_ssize_t endinpos;
3584 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003585 const char *e;
3586 PyUnicodeObject *unicode;
3587 Py_UNICODE *p;
3588 const char *errmsg = "";
3589 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003590 Py_UNICODE *shiftOutStart;
3591 unsigned int base64bits = 0;
3592 unsigned long base64buffer = 0;
3593 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003594 PyObject *errorHandler = NULL;
3595 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003596
3597 unicode = _PyUnicode_New(size);
3598 if (!unicode)
3599 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003600 if (size == 0) {
3601 if (consumed)
3602 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003603 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003604 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003605
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003606 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003607 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003608 e = s + size;
3609
3610 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003611 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003612 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003613 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003614
Antoine Pitrou244651a2009-05-04 18:56:13 +00003615 if (inShift) { /* in a base-64 section */
3616 if (IS_BASE64(ch)) { /* consume a base-64 character */
3617 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3618 base64bits += 6;
3619 s++;
3620 if (base64bits >= 16) {
3621 /* we have enough bits for a UTF-16 value */
3622 Py_UNICODE outCh = (Py_UNICODE)
3623 (base64buffer >> (base64bits-16));
3624 base64bits -= 16;
3625 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3626 if (surrogate) {
3627 /* expecting a second surrogate */
3628 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3629#ifdef Py_UNICODE_WIDE
3630 *p++ = (((surrogate & 0x3FF)<<10)
3631 | (outCh & 0x3FF)) + 0x10000;
3632#else
3633 *p++ = surrogate;
3634 *p++ = outCh;
3635#endif
3636 surrogate = 0;
3637 }
3638 else {
3639 surrogate = 0;
3640 errmsg = "second surrogate missing";
3641 goto utf7Error;
3642 }
3643 }
3644 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3645 /* first surrogate */
3646 surrogate = outCh;
3647 }
3648 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3649 errmsg = "unexpected second surrogate";
3650 goto utf7Error;
3651 }
3652 else {
3653 *p++ = outCh;
3654 }
3655 }
3656 }
3657 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003658 inShift = 0;
3659 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003660 if (surrogate) {
3661 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003662 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003663 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003664 if (base64bits > 0) { /* left-over bits */
3665 if (base64bits >= 6) {
3666 /* We've seen at least one base-64 character */
3667 errmsg = "partial character in shift sequence";
3668 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003669 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003670 else {
3671 /* Some bits remain; they should be zero */
3672 if (base64buffer != 0) {
3673 errmsg = "non-zero padding bits in shift sequence";
3674 goto utf7Error;
3675 }
3676 }
3677 }
3678 if (ch != '-') {
3679 /* '-' is absorbed; other terminating
3680 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003681 *p++ = ch;
3682 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003683 }
3684 }
3685 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003686 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003687 s++; /* consume '+' */
3688 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003689 s++;
3690 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003691 }
3692 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003693 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003694 shiftOutStart = p;
3695 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003696 }
3697 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003698 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003699 *p++ = ch;
3700 s++;
3701 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003702 else {
3703 startinpos = s-starts;
3704 s++;
3705 errmsg = "unexpected special character";
3706 goto utf7Error;
3707 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003708 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003709utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003710 outpos = p-PyUnicode_AS_UNICODE(unicode);
3711 endinpos = s-starts;
3712 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003713 errors, &errorHandler,
3714 "utf7", errmsg,
3715 &starts, &e, &startinpos, &endinpos, &exc, &s,
3716 &unicode, &outpos, &p))
3717 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003718 }
3719
Antoine Pitrou244651a2009-05-04 18:56:13 +00003720 /* end of string */
3721
3722 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3723 /* if we're in an inconsistent state, that's an error */
3724 if (surrogate ||
3725 (base64bits >= 6) ||
3726 (base64bits > 0 && base64buffer != 0)) {
3727 outpos = p-PyUnicode_AS_UNICODE(unicode);
3728 endinpos = size;
3729 if (unicode_decode_call_errorhandler(
3730 errors, &errorHandler,
3731 "utf7", "unterminated shift sequence",
3732 &starts, &e, &startinpos, &endinpos, &exc, &s,
3733 &unicode, &outpos, &p))
3734 goto onError;
3735 if (s < e)
3736 goto restart;
3737 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003738 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003739
3740 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003741 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003742 if (inShift) {
3743 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003744 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003745 }
3746 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003747 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003748 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003749 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003750
Victor Stinnerfe226c02011-10-03 03:52:20 +02003751 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003752 goto onError;
3753
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003754 Py_XDECREF(errorHandler);
3755 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02003756#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02003757 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003758 Py_DECREF(unicode);
3759 return NULL;
3760 }
Victor Stinner17efeed2011-10-04 20:05:46 +02003761#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02003762 assert(_PyUnicode_CheckConsistency(unicode, 1));
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003763 return (PyObject *)unicode;
3764
Benjamin Peterson29060642009-01-31 22:14:21 +00003765 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003766 Py_XDECREF(errorHandler);
3767 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003768 Py_DECREF(unicode);
3769 return NULL;
3770}
3771
3772
Alexander Belopolsky40018472011-02-26 01:02:56 +00003773PyObject *
3774PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003775 Py_ssize_t size,
3776 int base64SetO,
3777 int base64WhiteSpace,
3778 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003779{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003780 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003781 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003782 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003783 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003784 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003785 unsigned int base64bits = 0;
3786 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003787 char * out;
3788 char * start;
3789
3790 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003791 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003792
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003793 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003794 return PyErr_NoMemory();
3795
Antoine Pitrou244651a2009-05-04 18:56:13 +00003796 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003797 if (v == NULL)
3798 return NULL;
3799
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003800 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003801 for (;i < size; ++i) {
3802 Py_UNICODE ch = s[i];
3803
Antoine Pitrou244651a2009-05-04 18:56:13 +00003804 if (inShift) {
3805 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3806 /* shifting out */
3807 if (base64bits) { /* output remaining bits */
3808 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3809 base64buffer = 0;
3810 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003811 }
3812 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003813 /* Characters not in the BASE64 set implicitly unshift the sequence
3814 so no '-' is required, except if the character is itself a '-' */
3815 if (IS_BASE64(ch) || ch == '-') {
3816 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003817 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003818 *out++ = (char) ch;
3819 }
3820 else {
3821 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003822 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003823 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003824 else { /* not in a shift sequence */
3825 if (ch == '+') {
3826 *out++ = '+';
3827 *out++ = '-';
3828 }
3829 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3830 *out++ = (char) ch;
3831 }
3832 else {
3833 *out++ = '+';
3834 inShift = 1;
3835 goto encode_char;
3836 }
3837 }
3838 continue;
3839encode_char:
3840#ifdef Py_UNICODE_WIDE
3841 if (ch >= 0x10000) {
3842 /* code first surrogate */
3843 base64bits += 16;
3844 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3845 while (base64bits >= 6) {
3846 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3847 base64bits -= 6;
3848 }
3849 /* prepare second surrogate */
3850 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3851 }
3852#endif
3853 base64bits += 16;
3854 base64buffer = (base64buffer << 16) | ch;
3855 while (base64bits >= 6) {
3856 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3857 base64bits -= 6;
3858 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003859 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003860 if (base64bits)
3861 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3862 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003863 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003864 if (_PyBytes_Resize(&v, out - start) < 0)
3865 return NULL;
3866 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003867}
3868
Antoine Pitrou244651a2009-05-04 18:56:13 +00003869#undef IS_BASE64
3870#undef FROM_BASE64
3871#undef TO_BASE64
3872#undef DECODE_DIRECT
3873#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003874
Guido van Rossumd57fd912000-03-10 22:53:23 +00003875/* --- UTF-8 Codec -------------------------------------------------------- */
3876
/* Map a UTF-8 lead (prefix) byte to the total length in bytes of the
   sequence it introduces.  Zero marks a byte that may not start a
   sequence.  See RFC 3629 for details. */
static
char utf8_code_length[256] = {
    /* 00-7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   /* 70-7F */
    /* 80-BF: continuation bytes, never valid as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   /* B0-BF */
    /* C0-C1 are illegal; C2-DF start two-byte sequences */
    0, 0, 2, 2, 2, 2, 2, 2,   2, 2, 2, 2, 2, 2, 2, 2,   /* C0-CF */
    2, 2, 2, 2, 2, 2, 2, 2,   2, 2, 2, 2, 2, 2, 2, 2,   /* D0-DF */
    /* E0-EF start three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3,   3, 3, 3, 3, 3, 3, 3, 3,   /* E0-EF */
    /* F0-F4 start four-byte sequences; F5-FF are illegal */
    4, 4, 4, 4, 4, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0    /* F0-FF */
};
3898
Alexander Belopolsky40018472011-02-26 01:02:56 +00003899PyObject *
3900PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003901 Py_ssize_t size,
3902 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003903{
Walter Dörwald69652032004-09-07 20:24:22 +00003904 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3905}
3906
Antoine Pitrouab868312009-01-10 15:40:25 +00003907/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3908#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3909
3910/* Mask to quickly check whether a C 'long' contains a
3911 non-ASCII, UTF8-encoded char. */
3912#if (SIZEOF_LONG == 8)
3913# define ASCII_CHAR_MASK 0x8080808080808080L
3914#elif (SIZEOF_LONG == 4)
3915# define ASCII_CHAR_MASK 0x80808080L
3916#else
3917# error C 'long' size should be either 4 or 8!
3918#endif
3919
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003920/* Scans a UTF-8 string and returns the maximum character to be expected,
3921 the size of the decoded unicode string and if any major errors were
3922 encountered.
3923
3924 This function does check basic UTF-8 sanity, it does however NOT CHECK
3925 if the string contains surrogates, and if all continuation bytes are
3926 within the correct ranges, these checks are performed in
3927 PyUnicode_DecodeUTF8Stateful.
3928
3929 If it sets has_errors to 1, it means the value of unicode_size and max_char
3930 will be bogus and you should not rely on useful information in them.
3931 */
3932static Py_UCS4
3933utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3934 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3935 int *has_errors)
3936{
3937 Py_ssize_t n;
3938 Py_ssize_t char_count = 0;
3939 Py_UCS4 max_char = 127, new_max;
3940 Py_UCS4 upper_bound;
3941 const unsigned char *p = (const unsigned char *)s;
3942 const unsigned char *end = p + string_size;
3943 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3944 int err = 0;
3945
3946 for (; p < end && !err; ++p, ++char_count) {
3947 /* Only check value if it's not a ASCII char... */
3948 if (*p < 0x80) {
3949 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3950 an explanation. */
3951 if (!((size_t) p & LONG_PTR_MASK)) {
3952 /* Help register allocation */
3953 register const unsigned char *_p = p;
3954 while (_p < aligned_end) {
3955 unsigned long value = *(unsigned long *) _p;
3956 if (value & ASCII_CHAR_MASK)
3957 break;
3958 _p += SIZEOF_LONG;
3959 char_count += SIZEOF_LONG;
3960 }
3961 p = _p;
3962 if (p == end)
3963 break;
3964 }
3965 }
3966 if (*p >= 0x80) {
3967 n = utf8_code_length[*p];
3968 new_max = max_char;
3969 switch (n) {
3970 /* invalid start byte */
3971 case 0:
3972 err = 1;
3973 break;
3974 case 2:
3975 /* Code points between 0x00FF and 0x07FF inclusive.
3976 Approximate the upper bound of the code point,
3977 if this flips over 255 we can be sure it will be more
3978 than 255 and the string will need 2 bytes per code coint,
3979 if it stays under or equal to 255, we can be sure 1 byte
3980 is enough.
3981 ((*p & 0b00011111) << 6) | 0b00111111 */
3982 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3983 if (max_char < upper_bound)
3984 new_max = upper_bound;
3985 /* Ensure we track at least that we left ASCII space. */
3986 if (new_max < 128)
3987 new_max = 128;
3988 break;
3989 case 3:
3990 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3991 always > 255 and <= 65535 and will always need 2 bytes. */
3992 if (max_char < 65535)
3993 new_max = 65535;
3994 break;
3995 case 4:
3996 /* Code point will be above 0xFFFF for sure in this case. */
3997 new_max = 65537;
3998 break;
3999 /* Internal error, this should be caught by the first if */
4000 case 1:
4001 default:
4002 assert(0 && "Impossible case in utf8_max_char_and_size");
4003 err = 1;
4004 }
4005 /* Instead of number of overall bytes for this code point,
Georg Brandl7597add2011-10-05 16:36:47 +02004006 n contains the number of following bytes: */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004007 --n;
4008 /* Check if the follow up chars are all valid continuation bytes */
4009 if (n >= 1) {
4010 const unsigned char *cont;
4011 if ((p + n) >= end) {
4012 if (consumed == 0)
4013 /* incomplete data, non-incremental decoding */
4014 err = 1;
4015 break;
4016 }
4017 for (cont = p + 1; cont < (p + n); ++cont) {
4018 if ((*cont & 0xc0) != 0x80) {
4019 err = 1;
4020 break;
4021 }
4022 }
4023 p += n;
4024 }
4025 else
4026 err = 1;
4027 max_char = new_max;
4028 }
4029 }
4030
4031 if (unicode_size)
4032 *unicode_size = char_count;
4033 if (has_errors)
4034 *has_errors = err;
4035 return max_char;
4036}
4037
/* Store `value` at position `index` of the character buffer `buf`,
   narrowing it to the element type selected by `kind`.  Works like
   PyUnicode_WRITE but additionally accepts PyUnicode_WCHAR_KIND, in which
   case `buf` is treated as the Py_UNICODE (wstr) array of the legacy
   unicode representation.  Any kind other than the WCHAR, 1BYTE and 2BYTE
   ones is written as Py_UCS4.  `kind` is evaluated exactly once, and only
   the selected branch evaluates `buf`, `index` and `value`. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                     \
    do {                                                                    \
        const int k_ = (kind);                                              \
        switch (k_) {                                                       \
        case PyUnicode_WCHAR_KIND:                                          \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);           \
            break;                                                          \
        case PyUnicode_1BYTE_KIND:                                          \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value);     \
            break;                                                          \
        case PyUnicode_2BYTE_KIND:                                          \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);                 \
            break;                                                          \
        default:                                                            \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);                 \
            break;                                                          \
        }                                                                   \
    } while (0)
4052
Alexander Belopolsky40018472011-02-26 01:02:56 +00004053PyObject *
4054PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004055 Py_ssize_t size,
4056 const char *errors,
4057 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004058{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004059 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004060 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004061 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004062 Py_ssize_t startinpos;
4063 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00004064 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004065 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004066 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004067 PyObject *errorHandler = NULL;
4068 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004069 Py_UCS4 maxchar = 0;
4070 Py_ssize_t unicode_size;
4071 Py_ssize_t i;
4072 int kind;
4073 void *data;
4074 int has_errors;
4075 Py_UNICODE *error_outptr;
4076#if SIZEOF_WCHAR_T == 2
4077 Py_ssize_t wchar_offset = 0;
4078#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004079
Walter Dörwald69652032004-09-07 20:24:22 +00004080 if (size == 0) {
4081 if (consumed)
4082 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004083 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00004084 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004085 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
4086 consumed, &has_errors);
4087 if (has_errors) {
4088 unicode = _PyUnicode_New(size);
4089 if (!unicode)
4090 return NULL;
4091 kind = PyUnicode_WCHAR_KIND;
4092 data = PyUnicode_AS_UNICODE(unicode);
4093 assert(data != NULL);
4094 }
4095 else {
4096 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
4097 if (!unicode)
4098 return NULL;
4099 /* When the string is ASCII only, just use memcpy and return.
4100 unicode_size may be != size if there is an incomplete UTF-8
4101 sequence at the end of the ASCII block. */
4102 if (maxchar < 128 && size == unicode_size) {
4103 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4104 return (PyObject *)unicode;
4105 }
4106 kind = PyUnicode_KIND(unicode);
4107 data = PyUnicode_DATA(unicode);
4108 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004109 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004110 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004111 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00004112 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004113
4114 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004115 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004116
4117 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004118 /* Fast path for runs of ASCII characters. Given that common UTF-8
4119 input will consist of an overwhelming majority of ASCII
4120 characters, we try to optimize for this case by checking
4121 as many characters as a C 'long' can contain.
4122 First, check if we can do an aligned read, as most CPUs have
4123 a penalty for unaligned reads.
4124 */
4125 if (!((size_t) s & LONG_PTR_MASK)) {
4126 /* Help register allocation */
4127 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004128 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004129 while (_s < aligned_end) {
4130 /* Read a whole long at a time (either 4 or 8 bytes),
4131 and do a fast unrolled copy if it only contains ASCII
4132 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004133 unsigned long value = *(unsigned long *) _s;
4134 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004135 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004136 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4137 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4138 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4139 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004140#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004141 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4142 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4143 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4144 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004145#endif
4146 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004147 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004148 }
4149 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004150 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004151 if (s == e)
4152 break;
4153 ch = (unsigned char)*s;
4154 }
4155 }
4156
4157 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004158 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004159 s++;
4160 continue;
4161 }
4162
4163 n = utf8_code_length[ch];
4164
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004165 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004166 if (consumed)
4167 break;
4168 else {
4169 errmsg = "unexpected end of data";
4170 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004171 endinpos = startinpos+1;
4172 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4173 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004174 goto utf8Error;
4175 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004176 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004177
4178 switch (n) {
4179
4180 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004181 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004182 startinpos = s-starts;
4183 endinpos = startinpos+1;
4184 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004185
4186 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004187 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004188 startinpos = s-starts;
4189 endinpos = startinpos+1;
4190 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004191
4192 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004193 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004194 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004195 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004196 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004197 goto utf8Error;
4198 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004199 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004200 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004201 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004202 break;
4203
4204 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004205 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4206 will result in surrogates in range d800-dfff. Surrogates are
4207 not valid UTF-8 so they are rejected.
4208 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4209 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004210 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004211 (s[2] & 0xc0) != 0x80 ||
4212 ((unsigned char)s[0] == 0xE0 &&
4213 (unsigned char)s[1] < 0xA0) ||
4214 ((unsigned char)s[0] == 0xED &&
4215 (unsigned char)s[1] > 0x9F)) {
4216 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004217 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004218 endinpos = startinpos + 1;
4219
4220 /* if s[1] first two bits are 1 and 0, then the invalid
4221 continuation byte is s[2], so increment endinpos by 1,
4222 if not, s[1] is invalid and endinpos doesn't need to
4223 be incremented. */
4224 if ((s[1] & 0xC0) == 0x80)
4225 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004226 goto utf8Error;
4227 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004228 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004229 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004230 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004231 break;
4232
4233 case 4:
4234 if ((s[1] & 0xc0) != 0x80 ||
4235 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004236 (s[3] & 0xc0) != 0x80 ||
4237 ((unsigned char)s[0] == 0xF0 &&
4238 (unsigned char)s[1] < 0x90) ||
4239 ((unsigned char)s[0] == 0xF4 &&
4240 (unsigned char)s[1] > 0x8F)) {
4241 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004242 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004243 endinpos = startinpos + 1;
4244 if ((s[1] & 0xC0) == 0x80) {
4245 endinpos++;
4246 if ((s[2] & 0xC0) == 0x80)
4247 endinpos++;
4248 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004249 goto utf8Error;
4250 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004251 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004252 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4253 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4254
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004255 /* If the string is flexible or we have native UCS-4, write
4256 directly.. */
4257 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4258 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004259
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004260 else {
4261 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004262
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004263 /* translate from 10000..10FFFF to 0..FFFF */
4264 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004265
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004266 /* high surrogate = top 10 bits added to D800 */
4267 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4268 (Py_UNICODE)(0xD800 + (ch >> 10)));
4269
4270 /* low surrogate = bottom 10 bits added to DC00 */
4271 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4272 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4273 }
4274#if SIZEOF_WCHAR_T == 2
4275 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004276#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004277 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004278 }
4279 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004280 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004281
Benjamin Peterson29060642009-01-31 22:14:21 +00004282 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004283 /* If this is not yet a resizable string, make it one.. */
4284 if (kind != PyUnicode_WCHAR_KIND) {
4285 const Py_UNICODE *u;
4286 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4287 if (!new_unicode)
4288 goto onError;
4289 u = PyUnicode_AsUnicode((PyObject *)unicode);
4290 if (!u)
4291 goto onError;
4292#if SIZEOF_WCHAR_T == 2
4293 i += wchar_offset;
4294#endif
4295 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4296 Py_DECREF(unicode);
4297 unicode = new_unicode;
4298 kind = 0;
4299 data = PyUnicode_AS_UNICODE(new_unicode);
4300 assert(data != NULL);
4301 }
4302 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004303 if (unicode_decode_call_errorhandler(
4304 errors, &errorHandler,
4305 "utf8", errmsg,
4306 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004307 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004308 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004309 /* Update data because unicode_decode_call_errorhandler might have
4310 re-created or resized the unicode object. */
4311 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004312 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004313 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004314 /* Ensure the unicode_size calculation above was correct: */
4315 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4316
Walter Dörwald69652032004-09-07 20:24:22 +00004317 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004318 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004319
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004320 /* Adjust length and ready string when it contained errors and
4321 is of the old resizable kind. */
4322 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004323 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004324 goto onError;
4325 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004326
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004327 Py_XDECREF(errorHandler);
4328 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004329#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004330 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004331 Py_DECREF(unicode);
4332 return NULL;
4333 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004334#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004335 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004336 return (PyObject *)unicode;
4337
Benjamin Peterson29060642009-01-31 22:14:21 +00004338 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004339 Py_XDECREF(errorHandler);
4340 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004341 Py_DECREF(unicode);
4342 return NULL;
4343}
4344
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004345#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004346
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004347#ifdef __APPLE__
4348
4349/* Simplified UTF-8 decoder using surrogateescape error handler,
4350 used to decode the command line arguments on Mac OS X. */
4351
4352wchar_t*
4353_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4354{
4355 int n;
4356 const char *e;
4357 wchar_t *unicode, *p;
4358
4359 /* Note: size will always be longer than the resulting Unicode
4360 character count */
4361 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4362 PyErr_NoMemory();
4363 return NULL;
4364 }
4365 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4366 if (!unicode)
4367 return NULL;
4368
4369 /* Unpack UTF-8 encoded data */
4370 p = unicode;
4371 e = s + size;
4372 while (s < e) {
4373 Py_UCS4 ch = (unsigned char)*s;
4374
4375 if (ch < 0x80) {
4376 *p++ = (wchar_t)ch;
4377 s++;
4378 continue;
4379 }
4380
4381 n = utf8_code_length[ch];
4382 if (s + n > e) {
4383 goto surrogateescape;
4384 }
4385
4386 switch (n) {
4387 case 0:
4388 case 1:
4389 goto surrogateescape;
4390
4391 case 2:
4392 if ((s[1] & 0xc0) != 0x80)
4393 goto surrogateescape;
4394 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4395 assert ((ch > 0x007F) && (ch <= 0x07FF));
4396 *p++ = (wchar_t)ch;
4397 break;
4398
4399 case 3:
4400 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4401 will result in surrogates in range d800-dfff. Surrogates are
4402 not valid UTF-8 so they are rejected.
4403 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4404 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4405 if ((s[1] & 0xc0) != 0x80 ||
4406 (s[2] & 0xc0) != 0x80 ||
4407 ((unsigned char)s[0] == 0xE0 &&
4408 (unsigned char)s[1] < 0xA0) ||
4409 ((unsigned char)s[0] == 0xED &&
4410 (unsigned char)s[1] > 0x9F)) {
4411
4412 goto surrogateescape;
4413 }
4414 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4415 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004416 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004417 break;
4418
4419 case 4:
4420 if ((s[1] & 0xc0) != 0x80 ||
4421 (s[2] & 0xc0) != 0x80 ||
4422 (s[3] & 0xc0) != 0x80 ||
4423 ((unsigned char)s[0] == 0xF0 &&
4424 (unsigned char)s[1] < 0x90) ||
4425 ((unsigned char)s[0] == 0xF4 &&
4426 (unsigned char)s[1] > 0x8F)) {
4427 goto surrogateescape;
4428 }
4429 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4430 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4431 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4432
4433#if SIZEOF_WCHAR_T == 4
4434 *p++ = (wchar_t)ch;
4435#else
4436 /* compute and append the two surrogates: */
4437
4438 /* translate from 10000..10FFFF to 0..FFFF */
4439 ch -= 0x10000;
4440
4441 /* high surrogate = top 10 bits added to D800 */
4442 *p++ = (wchar_t)(0xD800 + (ch >> 10));
4443
4444 /* low surrogate = bottom 10 bits added to DC00 */
4445 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4446#endif
4447 break;
4448 }
4449 s += n;
4450 continue;
4451
4452 surrogateescape:
4453 *p++ = 0xDC00 + ch;
4454 s++;
4455 }
4456 *p = L'\0';
4457 return unicode;
4458}
4459
4460#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004461
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004462/* Primary internal function which creates utf8 encoded bytes objects.
4463
4464 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004465 and allocate exactly as much space needed at the end. Else allocate the
4466 maximum possible needed (4 result bytes per Unicode character), and return
4467 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004468*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004469PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004470_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004471{
Tim Peters602f7402002-04-27 18:03:26 +00004472#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004473
Guido van Rossum98297ee2007-11-06 21:34:58 +00004474 Py_ssize_t i; /* index into s of next input byte */
4475 PyObject *result; /* result string object */
4476 char *p; /* next free byte in output buffer */
4477 Py_ssize_t nallocated; /* number of result bytes allocated */
4478 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004479 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004480 PyObject *errorHandler = NULL;
4481 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004482 int kind;
4483 void *data;
4484 Py_ssize_t size;
4485 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4486#if SIZEOF_WCHAR_T == 2
4487 Py_ssize_t wchar_offset = 0;
4488#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004489
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004490 if (!PyUnicode_Check(unicode)) {
4491 PyErr_BadArgument();
4492 return NULL;
4493 }
4494
4495 if (PyUnicode_READY(unicode) == -1)
4496 return NULL;
4497
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004498 if (PyUnicode_UTF8(unicode))
4499 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4500 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004501
4502 kind = PyUnicode_KIND(unicode);
4503 data = PyUnicode_DATA(unicode);
4504 size = PyUnicode_GET_LENGTH(unicode);
4505
Tim Peters602f7402002-04-27 18:03:26 +00004506 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004507
Tim Peters602f7402002-04-27 18:03:26 +00004508 if (size <= MAX_SHORT_UNICHARS) {
4509 /* Write into the stack buffer; nallocated can't overflow.
4510 * At the end, we'll allocate exactly as much heap space as it
4511 * turns out we need.
4512 */
4513 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004514 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004515 p = stackbuf;
4516 }
4517 else {
4518 /* Overallocate on the heap, and give the excess back at the end. */
4519 nallocated = size * 4;
4520 if (nallocated / 4 != size) /* overflow! */
4521 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004522 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004523 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004524 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004525 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004526 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004527
Tim Peters602f7402002-04-27 18:03:26 +00004528 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004529 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004530
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004531 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004532 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004533 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004534
Guido van Rossumd57fd912000-03-10 22:53:23 +00004535 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004536 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004537 *p++ = (char)(0xc0 | (ch >> 6));
4538 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004539 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004540 Py_ssize_t newpos;
4541 PyObject *rep;
4542 Py_ssize_t repsize, k, startpos;
4543 startpos = i-1;
4544#if SIZEOF_WCHAR_T == 2
4545 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004546#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004547 rep = unicode_encode_call_errorhandler(
4548 errors, &errorHandler, "utf-8", "surrogates not allowed",
4549 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4550 &exc, startpos, startpos+1, &newpos);
4551 if (!rep)
4552 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004553
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004554 if (PyBytes_Check(rep))
4555 repsize = PyBytes_GET_SIZE(rep);
4556 else
4557 repsize = PyUnicode_GET_SIZE(rep);
4558
4559 if (repsize > 4) {
4560 Py_ssize_t offset;
4561
4562 if (result == NULL)
4563 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004564 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004565 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004566
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004567 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4568 /* integer overflow */
4569 PyErr_NoMemory();
4570 goto error;
4571 }
4572 nallocated += repsize - 4;
4573 if (result != NULL) {
4574 if (_PyBytes_Resize(&result, nallocated) < 0)
4575 goto error;
4576 } else {
4577 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004578 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004579 goto error;
4580 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4581 }
4582 p = PyBytes_AS_STRING(result) + offset;
4583 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004584
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004585 if (PyBytes_Check(rep)) {
4586 char *prep = PyBytes_AS_STRING(rep);
4587 for(k = repsize; k > 0; k--)
4588 *p++ = *prep++;
4589 } else /* rep is unicode */ {
4590 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4591 Py_UNICODE c;
4592
4593 for(k=0; k<repsize; k++) {
4594 c = prep[k];
4595 if (0x80 <= c) {
4596 raise_encode_exception(&exc, "utf-8",
4597 PyUnicode_AS_UNICODE(unicode),
4598 size, i-1, i,
4599 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004600 goto error;
4601 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004602 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004603 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004604 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004605 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004606 } else if (ch < 0x10000) {
4607 *p++ = (char)(0xe0 | (ch >> 12));
4608 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4609 *p++ = (char)(0x80 | (ch & 0x3f));
4610 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004611 /* Encode UCS4 Unicode ordinals */
4612 *p++ = (char)(0xf0 | (ch >> 18));
4613 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4614 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4615 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004616#if SIZEOF_WCHAR_T == 2
4617 wchar_offset++;
4618#endif
Tim Peters602f7402002-04-27 18:03:26 +00004619 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004620 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004621
Guido van Rossum98297ee2007-11-06 21:34:58 +00004622 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004623 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004624 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004625 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004626 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004627 }
4628 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004629 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004630 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004631 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004632 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004633 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004634
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004635 Py_XDECREF(errorHandler);
4636 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004637 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004638 error:
4639 Py_XDECREF(errorHandler);
4640 Py_XDECREF(exc);
4641 Py_XDECREF(result);
4642 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004643
Tim Peters602f7402002-04-27 18:03:26 +00004644#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004645}
4646
Alexander Belopolsky40018472011-02-26 01:02:56 +00004647PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004648PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4649 Py_ssize_t size,
4650 const char *errors)
4651{
4652 PyObject *v, *unicode;
4653
4654 unicode = PyUnicode_FromUnicode(s, size);
4655 if (unicode == NULL)
4656 return NULL;
4657 v = _PyUnicode_AsUTF8String(unicode, errors);
4658 Py_DECREF(unicode);
4659 return v;
4660}
4661
4662PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004663PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004664{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004665 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004666}
4667
Walter Dörwald41980ca2007-08-16 21:55:45 +00004668/* --- UTF-32 Codec ------------------------------------------------------- */
4669
4670PyObject *
4671PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004672 Py_ssize_t size,
4673 const char *errors,
4674 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004675{
4676 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4677}
4678
4679PyObject *
4680PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004681 Py_ssize_t size,
4682 const char *errors,
4683 int *byteorder,
4684 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004685{
4686 const char *starts = s;
4687 Py_ssize_t startinpos;
4688 Py_ssize_t endinpos;
4689 Py_ssize_t outpos;
4690 PyUnicodeObject *unicode;
4691 Py_UNICODE *p;
4692#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004693 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004694 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004695#else
4696 const int pairs = 0;
4697#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004698 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004699 int bo = 0; /* assume native ordering by default */
4700 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004701 /* Offsets from q for retrieving bytes in the right order. */
4702#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4703 int iorder[] = {0, 1, 2, 3};
4704#else
4705 int iorder[] = {3, 2, 1, 0};
4706#endif
4707 PyObject *errorHandler = NULL;
4708 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004709
Walter Dörwald41980ca2007-08-16 21:55:45 +00004710 q = (unsigned char *)s;
4711 e = q + size;
4712
4713 if (byteorder)
4714 bo = *byteorder;
4715
4716 /* Check for BOM marks (U+FEFF) in the input and adjust current
4717 byte order setting accordingly. In native mode, the leading BOM
4718 mark is skipped, in all other modes, it is copied to the output
4719 stream as-is (giving a ZWNBSP character). */
4720 if (bo == 0) {
4721 if (size >= 4) {
4722 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004723 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004724#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004725 if (bom == 0x0000FEFF) {
4726 q += 4;
4727 bo = -1;
4728 }
4729 else if (bom == 0xFFFE0000) {
4730 q += 4;
4731 bo = 1;
4732 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004733#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004734 if (bom == 0x0000FEFF) {
4735 q += 4;
4736 bo = 1;
4737 }
4738 else if (bom == 0xFFFE0000) {
4739 q += 4;
4740 bo = -1;
4741 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004742#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004743 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004744 }
4745
4746 if (bo == -1) {
4747 /* force LE */
4748 iorder[0] = 0;
4749 iorder[1] = 1;
4750 iorder[2] = 2;
4751 iorder[3] = 3;
4752 }
4753 else if (bo == 1) {
4754 /* force BE */
4755 iorder[0] = 3;
4756 iorder[1] = 2;
4757 iorder[2] = 1;
4758 iorder[3] = 0;
4759 }
4760
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004761 /* On narrow builds we split characters outside the BMP into two
4762 codepoints => count how much extra space we need. */
4763#ifndef Py_UNICODE_WIDE
4764 for (qq = q; qq < e; qq += 4)
4765 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4766 pairs++;
4767#endif
4768
4769 /* This might be one to much, because of a BOM */
4770 unicode = _PyUnicode_New((size+3)/4+pairs);
4771 if (!unicode)
4772 return NULL;
4773 if (size == 0)
4774 return (PyObject *)unicode;
4775
4776 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004777 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004778
Walter Dörwald41980ca2007-08-16 21:55:45 +00004779 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004780 Py_UCS4 ch;
4781 /* remaining bytes at the end? (size should be divisible by 4) */
4782 if (e-q<4) {
4783 if (consumed)
4784 break;
4785 errmsg = "truncated data";
4786 startinpos = ((const char *)q)-starts;
4787 endinpos = ((const char *)e)-starts;
4788 goto utf32Error;
4789 /* The remaining input chars are ignored if the callback
4790 chooses to skip the input */
4791 }
4792 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4793 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004794
Benjamin Peterson29060642009-01-31 22:14:21 +00004795 if (ch >= 0x110000)
4796 {
4797 errmsg = "codepoint not in range(0x110000)";
4798 startinpos = ((const char *)q)-starts;
4799 endinpos = startinpos+4;
4800 goto utf32Error;
4801 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004802#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004803 if (ch >= 0x10000)
4804 {
4805 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4806 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4807 }
4808 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004809#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004810 *p++ = ch;
4811 q += 4;
4812 continue;
4813 utf32Error:
4814 outpos = p-PyUnicode_AS_UNICODE(unicode);
4815 if (unicode_decode_call_errorhandler(
4816 errors, &errorHandler,
4817 "utf32", errmsg,
4818 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4819 &unicode, &outpos, &p))
4820 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004821 }
4822
4823 if (byteorder)
4824 *byteorder = bo;
4825
4826 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004827 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004828
4829 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02004830 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004831 goto onError;
4832
4833 Py_XDECREF(errorHandler);
4834 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004835#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004836 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004837 Py_DECREF(unicode);
4838 return NULL;
4839 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004840#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004841 assert(_PyUnicode_CheckConsistency(unicode, 1));
Walter Dörwald41980ca2007-08-16 21:55:45 +00004842 return (PyObject *)unicode;
4843
Benjamin Peterson29060642009-01-31 22:14:21 +00004844 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004845 Py_DECREF(unicode);
4846 Py_XDECREF(errorHandler);
4847 Py_XDECREF(exc);
4848 return NULL;
4849}
4850
4851PyObject *
4852PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004853 Py_ssize_t size,
4854 const char *errors,
4855 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004856{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004857 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004858 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004859 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004860#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004861 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004862#else
4863 const int pairs = 0;
4864#endif
4865 /* Offsets from p for storing byte pairs in the right order. */
4866#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4867 int iorder[] = {0, 1, 2, 3};
4868#else
4869 int iorder[] = {3, 2, 1, 0};
4870#endif
4871
Benjamin Peterson29060642009-01-31 22:14:21 +00004872#define STORECHAR(CH) \
4873 do { \
4874 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4875 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4876 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4877 p[iorder[0]] = (CH) & 0xff; \
4878 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004879 } while(0)
4880
4881 /* In narrow builds we can output surrogate pairs as one codepoint,
4882 so we need less space. */
4883#ifndef Py_UNICODE_WIDE
4884 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004885 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4886 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4887 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004888#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004889 nsize = (size - pairs + (byteorder == 0));
4890 bytesize = nsize * 4;
4891 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004892 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004893 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004894 if (v == NULL)
4895 return NULL;
4896
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004897 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004898 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004899 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004900 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004901 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004902
4903 if (byteorder == -1) {
4904 /* force LE */
4905 iorder[0] = 0;
4906 iorder[1] = 1;
4907 iorder[2] = 2;
4908 iorder[3] = 3;
4909 }
4910 else if (byteorder == 1) {
4911 /* force BE */
4912 iorder[0] = 3;
4913 iorder[1] = 2;
4914 iorder[2] = 1;
4915 iorder[3] = 0;
4916 }
4917
4918 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004919 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004920#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004921 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4922 Py_UCS4 ch2 = *s;
4923 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4924 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4925 s++;
4926 size--;
4927 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004928 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004929#endif
4930 STORECHAR(ch);
4931 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004932
4933 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004934 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004935#undef STORECHAR
4936}
4937
Alexander Belopolsky40018472011-02-26 01:02:56 +00004938PyObject *
4939PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004940{
4941 if (!PyUnicode_Check(unicode)) {
4942 PyErr_BadArgument();
4943 return NULL;
4944 }
4945 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004946 PyUnicode_GET_SIZE(unicode),
4947 NULL,
4948 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004949}
4950
Guido van Rossumd57fd912000-03-10 22:53:23 +00004951/* --- UTF-16 Codec ------------------------------------------------------- */
4952
Tim Peters772747b2001-08-09 22:21:55 +00004953PyObject *
4954PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004955 Py_ssize_t size,
4956 const char *errors,
4957 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004958{
Walter Dörwald69652032004-09-07 20:24:22 +00004959 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4960}
4961
Antoine Pitrouab868312009-01-10 15:40:25 +00004962/* Two masks for fast checking of whether a C 'long' may contain
4963 UTF16-encoded surrogate characters. This is an efficient heuristic,
4964 assuming that non-surrogate characters with a code point >= 0x8000 are
4965 rare in most input.
4966 FAST_CHAR_MASK is used when the input is in native byte ordering,
4967 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004968*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004969#if (SIZEOF_LONG == 8)
4970# define FAST_CHAR_MASK 0x8000800080008000L
4971# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4972#elif (SIZEOF_LONG == 4)
4973# define FAST_CHAR_MASK 0x80008000L
4974# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4975#else
4976# error C 'long' size should be either 4 or 8!
4977#endif
4978
Walter Dörwald69652032004-09-07 20:24:22 +00004979PyObject *
4980PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004981 Py_ssize_t size,
4982 const char *errors,
4983 int *byteorder,
4984 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004985{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004986 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004987 Py_ssize_t startinpos;
4988 Py_ssize_t endinpos;
4989 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004990 PyUnicodeObject *unicode;
4991 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004992 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004993 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004994 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004995 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004996 /* Offsets from q for retrieving byte pairs in the right order. */
4997#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4998 int ihi = 1, ilo = 0;
4999#else
5000 int ihi = 0, ilo = 1;
5001#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005002 PyObject *errorHandler = NULL;
5003 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005004
5005 /* Note: size will always be longer than the resulting Unicode
5006 character count */
5007 unicode = _PyUnicode_New(size);
5008 if (!unicode)
5009 return NULL;
5010 if (size == 0)
5011 return (PyObject *)unicode;
5012
5013 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005014 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00005015 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005016 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005017
5018 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005019 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005020
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005021 /* Check for BOM marks (U+FEFF) in the input and adjust current
5022 byte order setting accordingly. In native mode, the leading BOM
5023 mark is skipped, in all other modes, it is copied to the output
5024 stream as-is (giving a ZWNBSP character). */
5025 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005026 if (size >= 2) {
5027 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005028#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005029 if (bom == 0xFEFF) {
5030 q += 2;
5031 bo = -1;
5032 }
5033 else if (bom == 0xFFFE) {
5034 q += 2;
5035 bo = 1;
5036 }
Tim Petersced69f82003-09-16 20:30:58 +00005037#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005038 if (bom == 0xFEFF) {
5039 q += 2;
5040 bo = 1;
5041 }
5042 else if (bom == 0xFFFE) {
5043 q += 2;
5044 bo = -1;
5045 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005046#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005047 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005048 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005049
Tim Peters772747b2001-08-09 22:21:55 +00005050 if (bo == -1) {
5051 /* force LE */
5052 ihi = 1;
5053 ilo = 0;
5054 }
5055 else if (bo == 1) {
5056 /* force BE */
5057 ihi = 0;
5058 ilo = 1;
5059 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005060#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5061 native_ordering = ilo < ihi;
5062#else
5063 native_ordering = ilo > ihi;
5064#endif
Tim Peters772747b2001-08-09 22:21:55 +00005065
Antoine Pitrouab868312009-01-10 15:40:25 +00005066 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005067 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005068 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005069 /* First check for possible aligned read of a C 'long'. Unaligned
5070 reads are more expensive, better to defer to another iteration. */
5071 if (!((size_t) q & LONG_PTR_MASK)) {
5072 /* Fast path for runs of non-surrogate chars. */
5073 register const unsigned char *_q = q;
5074 Py_UNICODE *_p = p;
5075 if (native_ordering) {
5076 /* Native ordering is simple: as long as the input cannot
5077 possibly contain a surrogate char, do an unrolled copy
5078 of several 16-bit code points to the target object.
5079 The non-surrogate check is done on several input bytes
5080 at a time (as many as a C 'long' can contain). */
5081 while (_q < aligned_end) {
5082 unsigned long data = * (unsigned long *) _q;
5083 if (data & FAST_CHAR_MASK)
5084 break;
5085 _p[0] = ((unsigned short *) _q)[0];
5086 _p[1] = ((unsigned short *) _q)[1];
5087#if (SIZEOF_LONG == 8)
5088 _p[2] = ((unsigned short *) _q)[2];
5089 _p[3] = ((unsigned short *) _q)[3];
5090#endif
5091 _q += SIZEOF_LONG;
5092 _p += SIZEOF_LONG / 2;
5093 }
5094 }
5095 else {
5096 /* Byteswapped ordering is similar, but we must decompose
5097 the copy bytewise, and take care of zero'ing out the
5098 upper bytes if the target object is in 32-bit units
5099 (that is, in UCS-4 builds). */
5100 while (_q < aligned_end) {
5101 unsigned long data = * (unsigned long *) _q;
5102 if (data & SWAPPED_FAST_CHAR_MASK)
5103 break;
5104 /* Zero upper bytes in UCS-4 builds */
5105#if (Py_UNICODE_SIZE > 2)
5106 _p[0] = 0;
5107 _p[1] = 0;
5108#if (SIZEOF_LONG == 8)
5109 _p[2] = 0;
5110 _p[3] = 0;
5111#endif
5112#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005113 /* Issue #4916; UCS-4 builds on big endian machines must
5114 fill the two last bytes of each 4-byte unit. */
5115#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
5116# define OFF 2
5117#else
5118# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00005119#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005120 ((unsigned char *) _p)[OFF + 1] = _q[0];
5121 ((unsigned char *) _p)[OFF + 0] = _q[1];
5122 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
5123 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
5124#if (SIZEOF_LONG == 8)
5125 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
5126 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
5127 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
5128 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
5129#endif
5130#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00005131 _q += SIZEOF_LONG;
5132 _p += SIZEOF_LONG / 2;
5133 }
5134 }
5135 p = _p;
5136 q = _q;
5137 if (q >= e)
5138 break;
5139 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005140 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005141
Benjamin Peterson14339b62009-01-31 16:36:08 +00005142 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005143
5144 if (ch < 0xD800 || ch > 0xDFFF) {
5145 *p++ = ch;
5146 continue;
5147 }
5148
5149 /* UTF-16 code pair: */
5150 if (q > e) {
5151 errmsg = "unexpected end of data";
5152 startinpos = (((const char *)q) - 2) - starts;
5153 endinpos = ((const char *)e) + 1 - starts;
5154 goto utf16Error;
5155 }
5156 if (0xD800 <= ch && ch <= 0xDBFF) {
5157 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5158 q += 2;
5159 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00005160#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005161 *p++ = ch;
5162 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005163#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005164 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005165#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005166 continue;
5167 }
5168 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005169 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005170 startinpos = (((const char *)q)-4)-starts;
5171 endinpos = startinpos+2;
5172 goto utf16Error;
5173 }
5174
Benjamin Peterson14339b62009-01-31 16:36:08 +00005175 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005176 errmsg = "illegal encoding";
5177 startinpos = (((const char *)q)-2)-starts;
5178 endinpos = startinpos+2;
5179 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005180
Benjamin Peterson29060642009-01-31 22:14:21 +00005181 utf16Error:
5182 outpos = p - PyUnicode_AS_UNICODE(unicode);
5183 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005184 errors,
5185 &errorHandler,
5186 "utf16", errmsg,
5187 &starts,
5188 (const char **)&e,
5189 &startinpos,
5190 &endinpos,
5191 &exc,
5192 (const char **)&q,
5193 &unicode,
5194 &outpos,
5195 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00005196 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005197 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005198 /* remaining byte at the end? (size should be even) */
5199 if (e == q) {
5200 if (!consumed) {
5201 errmsg = "truncated data";
5202 startinpos = ((const char *)q) - starts;
5203 endinpos = ((const char *)e) + 1 - starts;
5204 outpos = p - PyUnicode_AS_UNICODE(unicode);
5205 if (unicode_decode_call_errorhandler(
5206 errors,
5207 &errorHandler,
5208 "utf16", errmsg,
5209 &starts,
5210 (const char **)&e,
5211 &startinpos,
5212 &endinpos,
5213 &exc,
5214 (const char **)&q,
5215 &unicode,
5216 &outpos,
5217 &p))
5218 goto onError;
5219 /* The remaining input chars are ignored if the callback
5220 chooses to skip the input */
5221 }
5222 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005223
5224 if (byteorder)
5225 *byteorder = bo;
5226
Walter Dörwald69652032004-09-07 20:24:22 +00005227 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005228 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005229
Guido van Rossumd57fd912000-03-10 22:53:23 +00005230 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005231 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005232 goto onError;
5233
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005234 Py_XDECREF(errorHandler);
5235 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005236#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005237 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005238 Py_DECREF(unicode);
5239 return NULL;
5240 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005241#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005242 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005243 return (PyObject *)unicode;
5244
Benjamin Peterson29060642009-01-31 22:14:21 +00005245 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005246 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005247 Py_XDECREF(errorHandler);
5248 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005249 return NULL;
5250}
5251
Antoine Pitrouab868312009-01-10 15:40:25 +00005252#undef FAST_CHAR_MASK
5253#undef SWAPPED_FAST_CHAR_MASK
5254
Tim Peters772747b2001-08-09 22:21:55 +00005255PyObject *
5256PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005257 Py_ssize_t size,
5258 const char *errors,
5259 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005260{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005261 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005262 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005263 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005264#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005265 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005266#else
5267 const int pairs = 0;
5268#endif
Tim Peters772747b2001-08-09 22:21:55 +00005269 /* Offsets from p for storing byte pairs in the right order. */
5270#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5271 int ihi = 1, ilo = 0;
5272#else
5273 int ihi = 0, ilo = 1;
5274#endif
5275
Benjamin Peterson29060642009-01-31 22:14:21 +00005276#define STORECHAR(CH) \
5277 do { \
5278 p[ihi] = ((CH) >> 8) & 0xff; \
5279 p[ilo] = (CH) & 0xff; \
5280 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005281 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005282
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005283#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005284 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005285 if (s[i] >= 0x10000)
5286 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005287#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005288 /* 2 * (size + pairs + (byteorder == 0)) */
5289 if (size > PY_SSIZE_T_MAX ||
5290 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005291 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005292 nsize = size + pairs + (byteorder == 0);
5293 bytesize = nsize * 2;
5294 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005295 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005296 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005297 if (v == NULL)
5298 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005299
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005300 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005301 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005302 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005303 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005304 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005305
5306 if (byteorder == -1) {
5307 /* force LE */
5308 ihi = 1;
5309 ilo = 0;
5310 }
5311 else if (byteorder == 1) {
5312 /* force BE */
5313 ihi = 0;
5314 ilo = 1;
5315 }
5316
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005317 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005318 Py_UNICODE ch = *s++;
5319 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005320#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005321 if (ch >= 0x10000) {
5322 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5323 ch = 0xD800 | ((ch-0x10000) >> 10);
5324 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005325#endif
Tim Peters772747b2001-08-09 22:21:55 +00005326 STORECHAR(ch);
5327 if (ch2)
5328 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005329 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005330
5331 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005332 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005333#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005334}
5335
Alexander Belopolsky40018472011-02-26 01:02:56 +00005336PyObject *
5337PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005338{
5339 if (!PyUnicode_Check(unicode)) {
5340 PyErr_BadArgument();
5341 return NULL;
5342 }
5343 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005344 PyUnicode_GET_SIZE(unicode),
5345 NULL,
5346 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005347}
5348
5349/* --- Unicode Escape Codec ----------------------------------------------- */
5350
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005351/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5352 if all the escapes in the string make it still a valid ASCII string.
5353 Returns -1 if any escapes were found which cause the string to
5354 pop out of ASCII range. Otherwise returns the length of the
5355 required buffer to hold the string.
5356 */
5357Py_ssize_t
5358length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5359{
5360 const unsigned char *p = (const unsigned char *)s;
5361 const unsigned char *end = p + size;
5362 Py_ssize_t length = 0;
5363
5364 if (size < 0)
5365 return -1;
5366
5367 for (; p < end; ++p) {
5368 if (*p > 127) {
5369 /* Non-ASCII */
5370 return -1;
5371 }
5372 else if (*p != '\\') {
5373 /* Normal character */
5374 ++length;
5375 }
5376 else {
5377 /* Backslash-escape, check next char */
5378 ++p;
5379 /* Escape sequence reaches till end of string or
5380 non-ASCII follow-up. */
5381 if (p >= end || *p > 127)
5382 return -1;
5383 switch (*p) {
5384 case '\n':
5385 /* backslash + \n result in zero characters */
5386 break;
5387 case '\\': case '\'': case '\"':
5388 case 'b': case 'f': case 't':
5389 case 'n': case 'r': case 'v': case 'a':
5390 ++length;
5391 break;
5392 case '0': case '1': case '2': case '3':
5393 case '4': case '5': case '6': case '7':
5394 case 'x': case 'u': case 'U': case 'N':
5395 /* these do not guarantee ASCII characters */
5396 return -1;
5397 default:
5398 /* count the backslash + the other character */
5399 length += 2;
5400 }
5401 }
5402 }
5403 return length;
5404}
5405
5406/* Similar to PyUnicode_WRITE but either write into wstr field
5407 or treat string as ASCII. */
5408#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
5409 do { \
5410 if ((kind) != PyUnicode_WCHAR_KIND) \
5411 ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
5412 else \
5413 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
5414 } while (0)
5415
5416#define WRITE_WSTR(buf, index, value) \
5417 assert(kind == PyUnicode_WCHAR_KIND), \
5418 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value)
5419
5420
Fredrik Lundh06d12682001-01-24 07:59:11 +00005421static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005422
Alexander Belopolsky40018472011-02-26 01:02:56 +00005423PyObject *
5424PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005425 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005426 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005427{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005428 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005429 Py_ssize_t startinpos;
5430 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005431 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005432 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005433 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005434 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005435 char* message;
5436 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005437 PyObject *errorHandler = NULL;
5438 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005439 Py_ssize_t ascii_length;
5440 Py_ssize_t i;
5441 int kind;
5442 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005443
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005444 ascii_length = length_of_escaped_ascii_string(s, size);
5445
5446 /* After length_of_escaped_ascii_string() there are two alternatives,
5447 either the string is pure ASCII with named escapes like \n, etc.
5448 and we determined it's exact size (common case)
5449 or it contains \x, \u, ... escape sequences. then we create a
5450 legacy wchar string and resize it at the end of this function. */
5451 if (ascii_length >= 0) {
5452 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5453 if (!v)
5454 goto onError;
5455 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5456 kind = PyUnicode_1BYTE_KIND;
5457 data = PyUnicode_DATA(v);
5458 }
5459 else {
5460 /* Escaped strings will always be longer than the resulting
5461 Unicode string, so we start with size here and then reduce the
5462 length after conversion to the true value.
5463 (but if the error callback returns a long replacement string
5464 we'll have to allocate more space) */
5465 v = _PyUnicode_New(size);
5466 if (!v)
5467 goto onError;
5468 kind = PyUnicode_WCHAR_KIND;
5469 data = PyUnicode_AS_UNICODE(v);
5470 }
5471
Guido van Rossumd57fd912000-03-10 22:53:23 +00005472 if (size == 0)
5473 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005474 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005475 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005476
Guido van Rossumd57fd912000-03-10 22:53:23 +00005477 while (s < end) {
5478 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005479 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005480 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005481
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005482 if (kind == PyUnicode_WCHAR_KIND) {
5483 assert(i < _PyUnicode_WSTR_LENGTH(v));
5484 }
5485 else {
5486 /* The only case in which i == ascii_length is a backslash
5487 followed by a newline. */
5488 assert(i <= ascii_length);
5489 }
5490
Guido van Rossumd57fd912000-03-10 22:53:23 +00005491 /* Non-escape characters are interpreted as Unicode ordinals */
5492 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005493 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005494 continue;
5495 }
5496
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005497 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005498 /* \ - Escapes */
5499 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005500 c = *s++;
5501 if (s > end)
5502 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005503
5504 if (kind == PyUnicode_WCHAR_KIND) {
5505 assert(i < _PyUnicode_WSTR_LENGTH(v));
5506 }
5507 else {
5508 /* The only case in which i == ascii_length is a backslash
5509 followed by a newline. */
5510 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5511 }
5512
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005513 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005514
Benjamin Peterson29060642009-01-31 22:14:21 +00005515 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005516 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005517 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5518 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5519 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5520 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5521 /* FF */
5522 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5523 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5524 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5525 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5526 /* VT */
5527 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5528 /* BEL, not classic C */
5529 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005530
Benjamin Peterson29060642009-01-31 22:14:21 +00005531 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005532 case '0': case '1': case '2': case '3':
5533 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005534 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005535 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005536 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005537 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005538 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005539 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005540 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005541 break;
5542
Benjamin Peterson29060642009-01-31 22:14:21 +00005543 /* hex escapes */
5544 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005546 digits = 2;
5547 message = "truncated \\xXX escape";
5548 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005549
Benjamin Peterson29060642009-01-31 22:14:21 +00005550 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005551 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005552 digits = 4;
5553 message = "truncated \\uXXXX escape";
5554 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005555
Benjamin Peterson29060642009-01-31 22:14:21 +00005556 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005557 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005558 digits = 8;
5559 message = "truncated \\UXXXXXXXX escape";
5560 hexescape:
5561 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005562 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005563 if (s+digits>end) {
5564 endinpos = size;
5565 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005566 errors, &errorHandler,
5567 "unicodeescape", "end of string in escape sequence",
5568 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005569 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005570 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005571 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005572 goto nextByte;
5573 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005574 for (j = 0; j < digits; ++j) {
5575 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005576 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005577 endinpos = (s+j+1)-starts;
5578 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005579 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005580 errors, &errorHandler,
5581 "unicodeescape", message,
5582 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005583 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005584 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005585 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005586 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005587 }
5588 chr = (chr<<4) & ~0xF;
5589 if (c >= '0' && c <= '9')
5590 chr += c - '0';
5591 else if (c >= 'a' && c <= 'f')
5592 chr += 10 + c - 'a';
5593 else
5594 chr += 10 + c - 'A';
5595 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005596 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005597 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005598 /* _decoding_error will have already written into the
5599 target buffer. */
5600 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005601 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005602 /* when we get here, chr is a 32-bit unicode character */
5603 if (chr <= 0xffff)
5604 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005605 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005606 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005607 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005608 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005609#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005610 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005611#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005612 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005613 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5614 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005615#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005616 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005617 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005618 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005619 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005620 errors, &errorHandler,
5621 "unicodeescape", "illegal Unicode character",
5622 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005623 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005624 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005625 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005626 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005627 break;
5628
Benjamin Peterson29060642009-01-31 22:14:21 +00005629 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005630 case 'N':
5631 message = "malformed \\N character escape";
5632 if (ucnhash_CAPI == NULL) {
5633 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005634 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5635 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005636 if (ucnhash_CAPI == NULL)
5637 goto ucnhashError;
5638 }
5639 if (*s == '{') {
5640 const char *start = s+1;
5641 /* look for the closing brace */
5642 while (*s != '}' && s < end)
5643 s++;
5644 if (s > start && s < end && *s == '}') {
5645 /* found a name. look it up in the unicode database */
5646 message = "unknown Unicode character name";
5647 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005648 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5649 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005650 goto store;
5651 }
5652 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005653 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005654 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005655 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005656 errors, &errorHandler,
5657 "unicodeescape", message,
5658 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005659 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005660 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005661 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005662 break;
5663
5664 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005665 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005666 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005667 message = "\\ at end of string";
5668 s--;
5669 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005670 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005671 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005672 errors, &errorHandler,
5673 "unicodeescape", message,
5674 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005675 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005676 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005677 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005678 }
5679 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005680 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5681 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005682 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005683 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005684 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005685 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005686 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005688 /* Ensure the length prediction worked in case of ASCII strings */
5689 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5690
Victor Stinnerfe226c02011-10-03 03:52:20 +02005691 if (kind == PyUnicode_WCHAR_KIND)
5692 {
5693 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5694 goto onError;
Victor Stinnerfe226c02011-10-03 03:52:20 +02005695 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005696 Py_XDECREF(errorHandler);
5697 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005698#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005699 if (_PyUnicode_READY_REPLACE(&v)) {
5700 Py_DECREF(v);
5701 return NULL;
5702 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005703#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005704 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005705 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005706
Benjamin Peterson29060642009-01-31 22:14:21 +00005707 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005708 PyErr_SetString(
5709 PyExc_UnicodeError,
5710 "\\N escapes not supported (can't load unicodedata module)"
5711 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005712 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005713 Py_XDECREF(errorHandler);
5714 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005715 return NULL;
5716
Benjamin Peterson29060642009-01-31 22:14:21 +00005717 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005718 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005719 Py_XDECREF(errorHandler);
5720 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005721 return NULL;
5722}
5723
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005724#undef WRITE_ASCII_OR_WSTR
5725#undef WRITE_WSTR
5726
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727/* Return a Unicode-Escape string version of the Unicode object.
5728
5729 If quotes is true, the string is enclosed in u"" or u'' quotes as
5730 appropriate.
5731
5732*/
5733
Walter Dörwald79e913e2007-05-12 11:08:06 +00005734static const char *hexdigits = "0123456789abcdef";
5735
Alexander Belopolsky40018472011-02-26 01:02:56 +00005736PyObject *
5737PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005738 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005739{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005740 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005743#ifdef Py_UNICODE_WIDE
5744 const Py_ssize_t expandsize = 10;
5745#else
5746 const Py_ssize_t expandsize = 6;
5747#endif
5748
Thomas Wouters89f507f2006-12-13 04:49:30 +00005749 /* XXX(nnorwitz): rather than over-allocating, it would be
5750 better to choose a different scheme. Perhaps scan the
5751 first N-chars of the string and allocate based on that size.
5752 */
5753 /* Initial allocation is based on the longest-possible unichr
5754 escape.
5755
5756 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5757 unichr, so in this case it's the longest unichr escape. In
5758 narrow (UTF-16) builds this is five chars per source unichr
5759 since there are two unichrs in the surrogate pair, so in narrow
5760 (UTF-16) builds it's not the longest unichr escape.
5761
5762 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5763 so in the narrow (UTF-16) build case it's the longest unichr
5764 escape.
5765 */
5766
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005767 if (size == 0)
5768 return PyBytes_FromStringAndSize(NULL, 0);
5769
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005770 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005771 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005772
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005773 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005774 2
5775 + expandsize*size
5776 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005777 if (repr == NULL)
5778 return NULL;
5779
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005780 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005781
Guido van Rossumd57fd912000-03-10 22:53:23 +00005782 while (size-- > 0) {
5783 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005784
Walter Dörwald79e913e2007-05-12 11:08:06 +00005785 /* Escape backslashes */
5786 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005787 *p++ = '\\';
5788 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005789 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005790 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005791
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005792#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005793 /* Map 21-bit characters to '\U00xxxxxx' */
5794 else if (ch >= 0x10000) {
5795 *p++ = '\\';
5796 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005797 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5798 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5799 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5800 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5801 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5802 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5803 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5804 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005805 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005806 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005807#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005808 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5809 else if (ch >= 0xD800 && ch < 0xDC00) {
5810 Py_UNICODE ch2;
5811 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005812
Benjamin Peterson29060642009-01-31 22:14:21 +00005813 ch2 = *s++;
5814 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005815 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005816 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5817 *p++ = '\\';
5818 *p++ = 'U';
5819 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5820 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5821 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5822 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5823 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5824 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5825 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5826 *p++ = hexdigits[ucs & 0x0000000F];
5827 continue;
5828 }
5829 /* Fall through: isolated surrogates are copied as-is */
5830 s--;
5831 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005832 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005833#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005834
Guido van Rossumd57fd912000-03-10 22:53:23 +00005835 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005836 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005837 *p++ = '\\';
5838 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005839 *p++ = hexdigits[(ch >> 12) & 0x000F];
5840 *p++ = hexdigits[(ch >> 8) & 0x000F];
5841 *p++ = hexdigits[(ch >> 4) & 0x000F];
5842 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005843 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005844
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005845 /* Map special whitespace to '\t', \n', '\r' */
5846 else if (ch == '\t') {
5847 *p++ = '\\';
5848 *p++ = 't';
5849 }
5850 else if (ch == '\n') {
5851 *p++ = '\\';
5852 *p++ = 'n';
5853 }
5854 else if (ch == '\r') {
5855 *p++ = '\\';
5856 *p++ = 'r';
5857 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005858
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005859 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005860 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005861 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005862 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005863 *p++ = hexdigits[(ch >> 4) & 0x000F];
5864 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005865 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005866
Guido van Rossumd57fd912000-03-10 22:53:23 +00005867 /* Copy everything else as-is */
5868 else
5869 *p++ = (char) ch;
5870 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005872 assert(p - PyBytes_AS_STRING(repr) > 0);
5873 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5874 return NULL;
5875 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005876}
5877
Alexander Belopolsky40018472011-02-26 01:02:56 +00005878PyObject *
5879PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005880{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005881 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005882 if (!PyUnicode_Check(unicode)) {
5883 PyErr_BadArgument();
5884 return NULL;
5885 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005886 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5887 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005888 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005889}
5890
5891/* --- Raw Unicode Escape Codec ------------------------------------------- */
5892
Alexander Belopolsky40018472011-02-26 01:02:56 +00005893PyObject *
5894PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005895 Py_ssize_t size,
5896 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005898 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005899 Py_ssize_t startinpos;
5900 Py_ssize_t endinpos;
5901 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005903 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904 const char *end;
5905 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005906 PyObject *errorHandler = NULL;
5907 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005908
Guido van Rossumd57fd912000-03-10 22:53:23 +00005909 /* Escaped strings will always be longer than the resulting
5910 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005911 length after conversion to the true value. (But decoding error
5912 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005913 v = _PyUnicode_New(size);
5914 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005915 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005917 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005918 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919 end = s + size;
5920 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005921 unsigned char c;
5922 Py_UCS4 x;
5923 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005924 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005925
Benjamin Peterson29060642009-01-31 22:14:21 +00005926 /* Non-escape characters are interpreted as Unicode ordinals */
5927 if (*s != '\\') {
5928 *p++ = (unsigned char)*s++;
5929 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005930 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005931 startinpos = s-starts;
5932
5933 /* \u-escapes are only interpreted iff the number of leading
5934 backslashes if odd */
5935 bs = s;
5936 for (;s < end;) {
5937 if (*s != '\\')
5938 break;
5939 *p++ = (unsigned char)*s++;
5940 }
5941 if (((s - bs) & 1) == 0 ||
5942 s >= end ||
5943 (*s != 'u' && *s != 'U')) {
5944 continue;
5945 }
5946 p--;
5947 count = *s=='u' ? 4 : 8;
5948 s++;
5949
5950 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5951 outpos = p-PyUnicode_AS_UNICODE(v);
5952 for (x = 0, i = 0; i < count; ++i, ++s) {
5953 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005954 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005955 endinpos = s-starts;
5956 if (unicode_decode_call_errorhandler(
5957 errors, &errorHandler,
5958 "rawunicodeescape", "truncated \\uXXXX",
5959 &starts, &end, &startinpos, &endinpos, &exc, &s,
5960 &v, &outpos, &p))
5961 goto onError;
5962 goto nextByte;
5963 }
5964 x = (x<<4) & ~0xF;
5965 if (c >= '0' && c <= '9')
5966 x += c - '0';
5967 else if (c >= 'a' && c <= 'f')
5968 x += 10 + c - 'a';
5969 else
5970 x += 10 + c - 'A';
5971 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005972 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005973 /* UCS-2 character */
5974 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005975 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005976 /* UCS-4 character. Either store directly, or as
5977 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005978#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005979 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005980#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005981 x -= 0x10000L;
5982 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5983 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005984#endif
5985 } else {
5986 endinpos = s-starts;
5987 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005988 if (unicode_decode_call_errorhandler(
5989 errors, &errorHandler,
5990 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005991 &starts, &end, &startinpos, &endinpos, &exc, &s,
5992 &v, &outpos, &p))
5993 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005994 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005995 nextByte:
5996 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02005998 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005999 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006000 Py_XDECREF(errorHandler);
6001 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006002#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006003 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006004 Py_DECREF(v);
6005 return NULL;
6006 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006007#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006008 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006010
Benjamin Peterson29060642009-01-31 22:14:21 +00006011 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006013 Py_XDECREF(errorHandler);
6014 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015 return NULL;
6016}
6017
Alexander Belopolsky40018472011-02-26 01:02:56 +00006018PyObject *
6019PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006020 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006022 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023 char *p;
6024 char *q;
6025
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006026#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006027 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006028#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006029 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006030#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00006031
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006032 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006033 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006034
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006035 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036 if (repr == NULL)
6037 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00006038 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006039 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006040
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006041 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006042 while (size-- > 0) {
6043 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006044#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006045 /* Map 32-bit characters to '\Uxxxxxxxx' */
6046 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006047 *p++ = '\\';
6048 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00006049 *p++ = hexdigits[(ch >> 28) & 0xf];
6050 *p++ = hexdigits[(ch >> 24) & 0xf];
6051 *p++ = hexdigits[(ch >> 20) & 0xf];
6052 *p++ = hexdigits[(ch >> 16) & 0xf];
6053 *p++ = hexdigits[(ch >> 12) & 0xf];
6054 *p++ = hexdigits[(ch >> 8) & 0xf];
6055 *p++ = hexdigits[(ch >> 4) & 0xf];
6056 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006057 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006058 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00006059#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006060 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6061 if (ch >= 0xD800 && ch < 0xDC00) {
6062 Py_UNICODE ch2;
6063 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006064
Benjamin Peterson29060642009-01-31 22:14:21 +00006065 ch2 = *s++;
6066 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00006067 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006068 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6069 *p++ = '\\';
6070 *p++ = 'U';
6071 *p++ = hexdigits[(ucs >> 28) & 0xf];
6072 *p++ = hexdigits[(ucs >> 24) & 0xf];
6073 *p++ = hexdigits[(ucs >> 20) & 0xf];
6074 *p++ = hexdigits[(ucs >> 16) & 0xf];
6075 *p++ = hexdigits[(ucs >> 12) & 0xf];
6076 *p++ = hexdigits[(ucs >> 8) & 0xf];
6077 *p++ = hexdigits[(ucs >> 4) & 0xf];
6078 *p++ = hexdigits[ucs & 0xf];
6079 continue;
6080 }
6081 /* Fall through: isolated surrogates are copied as-is */
6082 s--;
6083 size++;
6084 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006085#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006086 /* Map 16-bit characters to '\uxxxx' */
6087 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006088 *p++ = '\\';
6089 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00006090 *p++ = hexdigits[(ch >> 12) & 0xf];
6091 *p++ = hexdigits[(ch >> 8) & 0xf];
6092 *p++ = hexdigits[(ch >> 4) & 0xf];
6093 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006094 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006095 /* Copy everything else as-is */
6096 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006097 *p++ = (char) ch;
6098 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006099 size = p - q;
6100
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006101 assert(size > 0);
6102 if (_PyBytes_Resize(&repr, size) < 0)
6103 return NULL;
6104 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105}
6106
Alexander Belopolsky40018472011-02-26 01:02:56 +00006107PyObject *
6108PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006110 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00006112 PyErr_BadArgument();
6113 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114 }
Walter Dörwald711005d2007-05-12 12:03:26 +00006115 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6116 PyUnicode_GET_SIZE(unicode));
6117
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006118 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006119}
6120
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006121/* --- Unicode Internal Codec ------------------------------------------- */
6122
Alexander Belopolsky40018472011-02-26 01:02:56 +00006123PyObject *
6124_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006125 Py_ssize_t size,
6126 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006127{
6128 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006129 Py_ssize_t startinpos;
6130 Py_ssize_t endinpos;
6131 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006132 PyUnicodeObject *v;
6133 Py_UNICODE *p;
6134 const char *end;
6135 const char *reason;
6136 PyObject *errorHandler = NULL;
6137 PyObject *exc = NULL;
6138
Neal Norwitzd43069c2006-01-08 01:12:10 +00006139#ifdef Py_UNICODE_WIDE
6140 Py_UNICODE unimax = PyUnicode_GetMax();
6141#endif
6142
Thomas Wouters89f507f2006-12-13 04:49:30 +00006143 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006144 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
6145 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006146 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006147 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
6148 as string was created with the old API. */
6149 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006150 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006151 p = PyUnicode_AS_UNICODE(v);
6152 end = s + size;
6153
6154 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006155 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006156 /* We have to sanity check the raw data, otherwise doom looms for
6157 some malformed UCS-4 data. */
6158 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006159#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006160 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006161#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006162 end-s < Py_UNICODE_SIZE
6163 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006164 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006165 startinpos = s - starts;
6166 if (end-s < Py_UNICODE_SIZE) {
6167 endinpos = end-starts;
6168 reason = "truncated input";
6169 }
6170 else {
6171 endinpos = s - starts + Py_UNICODE_SIZE;
6172 reason = "illegal code point (> 0x10FFFF)";
6173 }
6174 outpos = p - PyUnicode_AS_UNICODE(v);
6175 if (unicode_decode_call_errorhandler(
6176 errors, &errorHandler,
6177 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006178 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00006179 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006180 goto onError;
6181 }
6182 }
6183 else {
6184 p++;
6185 s += Py_UNICODE_SIZE;
6186 }
6187 }
6188
Victor Stinnerfe226c02011-10-03 03:52:20 +02006189 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006190 goto onError;
6191 Py_XDECREF(errorHandler);
6192 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006193#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006194 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006195 Py_DECREF(v);
6196 return NULL;
6197 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006198#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006199 assert(_PyUnicode_CheckConsistency(v, 1));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006200 return (PyObject *)v;
6201
Benjamin Peterson29060642009-01-31 22:14:21 +00006202 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006203 Py_XDECREF(v);
6204 Py_XDECREF(errorHandler);
6205 Py_XDECREF(exc);
6206 return NULL;
6207}
6208
Guido van Rossumd57fd912000-03-10 22:53:23 +00006209/* --- Latin-1 Codec ------------------------------------------------------ */
6210
Alexander Belopolsky40018472011-02-26 01:02:56 +00006211PyObject *
6212PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006213 Py_ssize_t size,
6214 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006215{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006216 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006217 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006218}
6219
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006220/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006221static void
6222make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006223 const char *encoding,
6224 const Py_UNICODE *unicode, Py_ssize_t size,
6225 Py_ssize_t startpos, Py_ssize_t endpos,
6226 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006227{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006228 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006229 *exceptionObject = PyUnicodeEncodeError_Create(
6230 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006231 }
6232 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006233 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6234 goto onError;
6235 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6236 goto onError;
6237 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6238 goto onError;
6239 return;
6240 onError:
6241 Py_DECREF(*exceptionObject);
6242 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006243 }
6244}
6245
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006246/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006247static void
6248raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006249 const char *encoding,
6250 const Py_UNICODE *unicode, Py_ssize_t size,
6251 Py_ssize_t startpos, Py_ssize_t endpos,
6252 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006253{
6254 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006255 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006256 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006257 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006258}
6259
6260/* error handling callback helper:
6261 build arguments, call the callback and check the arguments,
6262 put the result into newpos and return the replacement string, which
6263 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006264static PyObject *
6265unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006266 PyObject **errorHandler,
6267 const char *encoding, const char *reason,
6268 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6269 Py_ssize_t startpos, Py_ssize_t endpos,
6270 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006271{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006272 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006273
6274 PyObject *restuple;
6275 PyObject *resunicode;
6276
6277 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006278 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006279 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006280 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006281 }
6282
6283 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006284 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006285 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006286 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006287
6288 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006289 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006290 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006291 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006292 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006293 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006294 Py_DECREF(restuple);
6295 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006296 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006297 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006298 &resunicode, newpos)) {
6299 Py_DECREF(restuple);
6300 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006301 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006302 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6303 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6304 Py_DECREF(restuple);
6305 return NULL;
6306 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006307 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006308 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006309 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006310 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6311 Py_DECREF(restuple);
6312 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006313 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006314 Py_INCREF(resunicode);
6315 Py_DECREF(restuple);
6316 return resunicode;
6317}
6318
Alexander Belopolsky40018472011-02-26 01:02:56 +00006319static PyObject *
6320unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006321 Py_ssize_t size,
6322 const char *errors,
6323 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006324{
6325 /* output object */
6326 PyObject *res;
6327 /* pointers to the beginning and end+1 of input */
6328 const Py_UNICODE *startp = p;
6329 const Py_UNICODE *endp = p + size;
6330 /* pointer to the beginning of the unencodable characters */
6331 /* const Py_UNICODE *badp = NULL; */
6332 /* pointer into the output */
6333 char *str;
6334 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006335 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006336 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6337 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006338 PyObject *errorHandler = NULL;
6339 PyObject *exc = NULL;
6340 /* the following variable is used for caching string comparisons
6341 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6342 int known_errorHandler = -1;
6343
6344 /* allocate enough for a simple encoding without
6345 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006346 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006347 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006348 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006349 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006350 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006351 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006352 ressize = size;
6353
6354 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006355 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006356
Benjamin Peterson29060642009-01-31 22:14:21 +00006357 /* can we encode this? */
6358 if (c<limit) {
6359 /* no overflow check, because we know that the space is enough */
6360 *str++ = (char)c;
6361 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006362 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006363 else {
6364 Py_ssize_t unicodepos = p-startp;
6365 Py_ssize_t requiredsize;
6366 PyObject *repunicode;
6367 Py_ssize_t repsize;
6368 Py_ssize_t newpos;
6369 Py_ssize_t respos;
6370 Py_UNICODE *uni2;
6371 /* startpos for collecting unencodable chars */
6372 const Py_UNICODE *collstart = p;
6373 const Py_UNICODE *collend = p;
6374 /* find all unecodable characters */
6375 while ((collend < endp) && ((*collend)>=limit))
6376 ++collend;
6377 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6378 if (known_errorHandler==-1) {
6379 if ((errors==NULL) || (!strcmp(errors, "strict")))
6380 known_errorHandler = 1;
6381 else if (!strcmp(errors, "replace"))
6382 known_errorHandler = 2;
6383 else if (!strcmp(errors, "ignore"))
6384 known_errorHandler = 3;
6385 else if (!strcmp(errors, "xmlcharrefreplace"))
6386 known_errorHandler = 4;
6387 else
6388 known_errorHandler = 0;
6389 }
6390 switch (known_errorHandler) {
6391 case 1: /* strict */
6392 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6393 goto onError;
6394 case 2: /* replace */
6395 while (collstart++<collend)
6396 *str++ = '?'; /* fall through */
6397 case 3: /* ignore */
6398 p = collend;
6399 break;
6400 case 4: /* xmlcharrefreplace */
6401 respos = str - PyBytes_AS_STRING(res);
6402 /* determine replacement size (temporarily (mis)uses p) */
6403 for (p = collstart, repsize = 0; p < collend; ++p) {
6404 if (*p<10)
6405 repsize += 2+1+1;
6406 else if (*p<100)
6407 repsize += 2+2+1;
6408 else if (*p<1000)
6409 repsize += 2+3+1;
6410 else if (*p<10000)
6411 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006412#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006413 else
6414 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006415#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006416 else if (*p<100000)
6417 repsize += 2+5+1;
6418 else if (*p<1000000)
6419 repsize += 2+6+1;
6420 else
6421 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006422#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006423 }
6424 requiredsize = respos+repsize+(endp-collend);
6425 if (requiredsize > ressize) {
6426 if (requiredsize<2*ressize)
6427 requiredsize = 2*ressize;
6428 if (_PyBytes_Resize(&res, requiredsize))
6429 goto onError;
6430 str = PyBytes_AS_STRING(res) + respos;
6431 ressize = requiredsize;
6432 }
6433 /* generate replacement (temporarily (mis)uses p) */
6434 for (p = collstart; p < collend; ++p) {
6435 str += sprintf(str, "&#%d;", (int)*p);
6436 }
6437 p = collend;
6438 break;
6439 default:
6440 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6441 encoding, reason, startp, size, &exc,
6442 collstart-startp, collend-startp, &newpos);
6443 if (repunicode == NULL)
6444 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006445 if (PyBytes_Check(repunicode)) {
6446 /* Directly copy bytes result to output. */
6447 repsize = PyBytes_Size(repunicode);
6448 if (repsize > 1) {
6449 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006450 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006451 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6452 Py_DECREF(repunicode);
6453 goto onError;
6454 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006455 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006456 ressize += repsize-1;
6457 }
6458 memcpy(str, PyBytes_AsString(repunicode), repsize);
6459 str += repsize;
6460 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006461 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006462 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006463 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006464 /* need more space? (at least enough for what we
6465 have+the replacement+the rest of the string, so
6466 we won't have to check space for encodable characters) */
6467 respos = str - PyBytes_AS_STRING(res);
6468 repsize = PyUnicode_GET_SIZE(repunicode);
6469 requiredsize = respos+repsize+(endp-collend);
6470 if (requiredsize > ressize) {
6471 if (requiredsize<2*ressize)
6472 requiredsize = 2*ressize;
6473 if (_PyBytes_Resize(&res, requiredsize)) {
6474 Py_DECREF(repunicode);
6475 goto onError;
6476 }
6477 str = PyBytes_AS_STRING(res) + respos;
6478 ressize = requiredsize;
6479 }
6480 /* check if there is anything unencodable in the replacement
6481 and copy it to the output */
6482 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6483 c = *uni2;
6484 if (c >= limit) {
6485 raise_encode_exception(&exc, encoding, startp, size,
6486 unicodepos, unicodepos+1, reason);
6487 Py_DECREF(repunicode);
6488 goto onError;
6489 }
6490 *str = (char)c;
6491 }
6492 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006493 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006494 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006495 }
6496 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006497 /* Resize if we allocated to much */
6498 size = str - PyBytes_AS_STRING(res);
6499 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006500 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006501 if (_PyBytes_Resize(&res, size) < 0)
6502 goto onError;
6503 }
6504
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006505 Py_XDECREF(errorHandler);
6506 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006507 return res;
6508
6509 onError:
6510 Py_XDECREF(res);
6511 Py_XDECREF(errorHandler);
6512 Py_XDECREF(exc);
6513 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006514}
6515
Alexander Belopolsky40018472011-02-26 01:02:56 +00006516PyObject *
6517PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006518 Py_ssize_t size,
6519 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006520{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006521 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006522}
6523
Alexander Belopolsky40018472011-02-26 01:02:56 +00006524PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006525_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006526{
6527 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006528 PyErr_BadArgument();
6529 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006531 if (PyUnicode_READY(unicode) == -1)
6532 return NULL;
6533 /* Fast path: if it is a one-byte string, construct
6534 bytes object directly. */
6535 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6536 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6537 PyUnicode_GET_LENGTH(unicode));
6538 /* Non-Latin-1 characters present. Defer to above function to
6539 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006540 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006541 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006542 errors);
6543}
6544
6545PyObject*
6546PyUnicode_AsLatin1String(PyObject *unicode)
6547{
6548 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006549}
6550
6551/* --- 7-bit ASCII Codec -------------------------------------------------- */
6552
Alexander Belopolsky40018472011-02-26 01:02:56 +00006553PyObject *
6554PyUnicode_DecodeASCII(const char *s,
6555 Py_ssize_t size,
6556 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006557{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006558 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006559 PyUnicodeObject *v;
Victor Stinner702c7342011-10-05 13:50:52 +02006560 Py_UNICODE *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006561 Py_ssize_t startinpos;
6562 Py_ssize_t endinpos;
6563 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006564 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006565 int has_error;
6566 const unsigned char *p = (const unsigned char *)s;
6567 const unsigned char *end = p + size;
6568 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006569 PyObject *errorHandler = NULL;
6570 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006571
Guido van Rossumd57fd912000-03-10 22:53:23 +00006572 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006573 if (size == 1 && (unsigned char)s[0] < 128)
6574 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006575
Victor Stinner702c7342011-10-05 13:50:52 +02006576 has_error = 0;
6577 while (p < end && !has_error) {
6578 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6579 an explanation. */
6580 if (!((size_t) p & LONG_PTR_MASK)) {
6581 /* Help register allocation */
6582 register const unsigned char *_p = p;
6583 while (_p < aligned_end) {
6584 unsigned long value = *(unsigned long *) _p;
6585 if (value & ASCII_CHAR_MASK) {
6586 has_error = 1;
6587 break;
6588 }
6589 _p += SIZEOF_LONG;
6590 }
6591 if (_p == end)
6592 break;
6593 if (has_error)
6594 break;
6595 p = _p;
6596 }
6597 if (*p & 0x80) {
6598 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006599 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006600 }
6601 else {
6602 ++p;
6603 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006604 }
Victor Stinner702c7342011-10-05 13:50:52 +02006605 if (!has_error)
6606 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006607
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608 v = _PyUnicode_New(size);
6609 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006610 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006612 return (PyObject *)v;
Victor Stinner702c7342011-10-05 13:50:52 +02006613 u = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006614 e = s + size;
6615 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006616 register unsigned char c = (unsigned char)*s;
6617 if (c < 128) {
Victor Stinner702c7342011-10-05 13:50:52 +02006618 *u++ = c;
Benjamin Peterson29060642009-01-31 22:14:21 +00006619 ++s;
6620 }
6621 else {
6622 startinpos = s-starts;
6623 endinpos = startinpos + 1;
Victor Stinner702c7342011-10-05 13:50:52 +02006624 outpos = u - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006625 if (unicode_decode_call_errorhandler(
6626 errors, &errorHandler,
6627 "ascii", "ordinal not in range(128)",
6628 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinner702c7342011-10-05 13:50:52 +02006629 &v, &outpos, &u))
Benjamin Peterson29060642009-01-31 22:14:21 +00006630 goto onError;
6631 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006632 }
Victor Stinner702c7342011-10-05 13:50:52 +02006633 if (u - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
6634 if (PyUnicode_Resize((PyObject**)&v, u - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006635 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006636 Py_XDECREF(errorHandler);
6637 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006638#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006639 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006640 Py_DECREF(v);
6641 return NULL;
6642 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006643#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006644 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006645 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006646
Benjamin Peterson29060642009-01-31 22:14:21 +00006647 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006649 Py_XDECREF(errorHandler);
6650 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006651 return NULL;
6652}
6653
Alexander Belopolsky40018472011-02-26 01:02:56 +00006654PyObject *
6655PyUnicode_EncodeASCII(const Py_UNICODE *p,
6656 Py_ssize_t size,
6657 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006658{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006659 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660}
6661
Alexander Belopolsky40018472011-02-26 01:02:56 +00006662PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006663_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664{
6665 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006666 PyErr_BadArgument();
6667 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006669 if (PyUnicode_READY(unicode) == -1)
6670 return NULL;
6671 /* Fast path: if it is an ASCII-only string, construct bytes object
6672 directly. Else defer to above function to raise the exception. */
6673 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6674 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6675 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006677 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006678 errors);
6679}
6680
6681PyObject *
6682PyUnicode_AsASCIIString(PyObject *unicode)
6683{
6684 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006685}
6686
Victor Stinner99b95382011-07-04 14:23:54 +02006687#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006688
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006689/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006690
/* decode_mbcs() and encode_mbcs() take their length as an int.  When size_t
   is wider than int, the public MBCS functions in this section feed the
   input to them in pieces of at most INT_MAX units; NEED_RETRY switches
   that chunking on. */
#if SIZEOF_INT < SIZEOF_SIZE_T
#  define NEED_RETRY
#endif
6694
6695/* XXX This code is limited to "true" double-byte encodings, as
6696 a) it assumes an incomplete character consists of a single byte, and
6697 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006698 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006699
/* Decide whether the byte at s[offset] should be treated as a DBCS lead
   byte.

   Returns 0 when IsDBCSLeadByte() rejects the byte.  Otherwise the previous
   character position is obtained with CharPrev() and the result is 1 when
   any of the following holds: there is no previous character (CharPrev()
   returned the current position), the previous position is not itself a
   lead byte, or the previous character is exactly two bytes long.  In every
   other case the result is 0. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
6711
6712/*
6713 * Decode MBCS string into unicode object. If 'final' is set, converts
6714 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6715 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006716static int
6717decode_mbcs(PyUnicodeObject **v,
6718 const char *s, /* MBCS string */
6719 int size, /* sizeof MBCS string */
6720 int final,
6721 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006722{
6723 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006724 Py_ssize_t n;
6725 DWORD usize;
6726 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006727
6728 assert(size >= 0);
6729
Victor Stinner554f3f02010-06-16 23:33:54 +00006730 /* check and handle 'errors' arg */
6731 if (errors==NULL || strcmp(errors, "strict")==0)
6732 flags = MB_ERR_INVALID_CHARS;
6733 else if (strcmp(errors, "ignore")==0)
6734 flags = 0;
6735 else {
6736 PyErr_Format(PyExc_ValueError,
6737 "mbcs encoding does not support errors='%s'",
6738 errors);
6739 return -1;
6740 }
6741
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006742 /* Skip trailing lead-byte unless 'final' is set */
6743 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006744 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006745
6746 /* First get the size of the result */
6747 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006748 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6749 if (usize==0)
6750 goto mbcs_decode_error;
6751 } else
6752 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006753
6754 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006755 /* Create unicode object */
6756 *v = _PyUnicode_New(usize);
6757 if (*v == NULL)
6758 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006759 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006760 }
6761 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006762 /* Extend unicode object */
6763 n = PyUnicode_GET_SIZE(*v);
Victor Stinner2fd82272011-10-03 04:06:05 +02006764 if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006765 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006766 }
6767
6768 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006769 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006770 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006771 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6772 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006773 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006774 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006775 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006776
6777mbcs_decode_error:
6778 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6779 we raise a UnicodeDecodeError - else it is a 'generic'
6780 windows error
6781 */
6782 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6783 /* Ideally, we should get reason from FormatMessage - this
6784 is the Windows 2000 English version of the message
6785 */
6786 PyObject *exc = NULL;
6787 const char *reason = "No mapping for the Unicode character exists "
6788 "in the target multi-byte code page.";
6789 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6790 if (exc != NULL) {
6791 PyCodec_StrictErrors(exc);
6792 Py_DECREF(exc);
6793 }
6794 } else {
6795 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6796 }
6797 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006798}
6799
Alexander Belopolsky40018472011-02-26 01:02:56 +00006800PyObject *
6801PyUnicode_DecodeMBCSStateful(const char *s,
6802 Py_ssize_t size,
6803 const char *errors,
6804 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006805{
6806 PyUnicodeObject *v = NULL;
6807 int done;
6808
6809 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006810 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006811
6812#ifdef NEED_RETRY
6813 retry:
6814 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006815 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006816 else
6817#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006818 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006819
6820 if (done < 0) {
6821 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006822 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006823 }
6824
6825 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006826 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006827
6828#ifdef NEED_RETRY
6829 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006830 s += done;
6831 size -= done;
6832 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006833 }
6834#endif
Victor Stinner17efeed2011-10-04 20:05:46 +02006835#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006836 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006837 Py_DECREF(v);
6838 return NULL;
6839 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006840#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006841 assert(_PyUnicode_CheckConsistency(v, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006842 return (PyObject *)v;
6843}
6844
Alexander Belopolsky40018472011-02-26 01:02:56 +00006845PyObject *
6846PyUnicode_DecodeMBCS(const char *s,
6847 Py_ssize_t size,
6848 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006849{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006850 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6851}
6852
6853/*
6854 * Convert unicode into string object (MBCS).
6855 * Returns 0 if succeed, -1 otherwise.
6856 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006857static int
6858encode_mbcs(PyObject **repr,
6859 const Py_UNICODE *p, /* unicode */
6860 int size, /* size of unicode */
6861 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006862{
Victor Stinner554f3f02010-06-16 23:33:54 +00006863 BOOL usedDefaultChar = FALSE;
6864 BOOL *pusedDefaultChar;
6865 int mbcssize;
6866 Py_ssize_t n;
6867 PyObject *exc = NULL;
6868 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006869
6870 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006871
Victor Stinner554f3f02010-06-16 23:33:54 +00006872 /* check and handle 'errors' arg */
6873 if (errors==NULL || strcmp(errors, "strict")==0) {
6874 flags = WC_NO_BEST_FIT_CHARS;
6875 pusedDefaultChar = &usedDefaultChar;
6876 } else if (strcmp(errors, "replace")==0) {
6877 flags = 0;
6878 pusedDefaultChar = NULL;
6879 } else {
6880 PyErr_Format(PyExc_ValueError,
6881 "mbcs encoding does not support errors='%s'",
6882 errors);
6883 return -1;
6884 }
6885
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006886 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006887 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006888 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6889 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006890 if (mbcssize == 0) {
6891 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6892 return -1;
6893 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006894 /* If we used a default char, then we failed! */
6895 if (pusedDefaultChar && *pusedDefaultChar)
6896 goto mbcs_encode_error;
6897 } else {
6898 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006899 }
6900
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006901 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006902 /* Create string object */
6903 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6904 if (*repr == NULL)
6905 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006906 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006907 }
6908 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006909 /* Extend string object */
6910 n = PyBytes_Size(*repr);
6911 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6912 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006913 }
6914
6915 /* Do the conversion */
6916 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006917 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006918 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6919 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006920 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6921 return -1;
6922 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006923 if (pusedDefaultChar && *pusedDefaultChar)
6924 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006925 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006926 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006927
6928mbcs_encode_error:
6929 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6930 Py_XDECREF(exc);
6931 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006932}
6933
Alexander Belopolsky40018472011-02-26 01:02:56 +00006934PyObject *
6935PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6936 Py_ssize_t size,
6937 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006938{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006939 PyObject *repr = NULL;
6940 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006941
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006942#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006943 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006944 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006945 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006946 else
6947#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006948 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006949
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006950 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006951 Py_XDECREF(repr);
6952 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006953 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006954
6955#ifdef NEED_RETRY
6956 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006957 p += INT_MAX;
6958 size -= INT_MAX;
6959 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006960 }
6961#endif
6962
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006963 return repr;
6964}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006965
Alexander Belopolsky40018472011-02-26 01:02:56 +00006966PyObject *
6967PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006968{
6969 if (!PyUnicode_Check(unicode)) {
6970 PyErr_BadArgument();
6971 return NULL;
6972 }
6973 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006974 PyUnicode_GET_SIZE(unicode),
6975 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006976}
6977
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006978#undef NEED_RETRY
6979
Victor Stinner99b95382011-07-04 14:23:54 +02006980#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006981
Guido van Rossumd57fd912000-03-10 22:53:23 +00006982/* --- Character Mapping Codec -------------------------------------------- */
6983
Alexander Belopolsky40018472011-02-26 01:02:56 +00006984PyObject *
6985PyUnicode_DecodeCharmap(const char *s,
6986 Py_ssize_t size,
6987 PyObject *mapping,
6988 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006989{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006990 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006991 Py_ssize_t startinpos;
6992 Py_ssize_t endinpos;
6993 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006994 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006995 PyUnicodeObject *v;
6996 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006997 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006998 PyObject *errorHandler = NULL;
6999 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007000 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007001 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00007002
Guido van Rossumd57fd912000-03-10 22:53:23 +00007003 /* Default to Latin-1 */
7004 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007005 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007006
7007 v = _PyUnicode_New(size);
7008 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007009 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007010 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007011 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007012 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007013 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007014 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007015 mapstring = PyUnicode_AS_UNICODE(mapping);
7016 maplen = PyUnicode_GET_SIZE(mapping);
7017 while (s < e) {
7018 unsigned char ch = *s;
7019 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007020
Benjamin Peterson29060642009-01-31 22:14:21 +00007021 if (ch < maplen)
7022 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00007023
Benjamin Peterson29060642009-01-31 22:14:21 +00007024 if (x == 0xfffe) {
7025 /* undefined mapping */
7026 outpos = p-PyUnicode_AS_UNICODE(v);
7027 startinpos = s-starts;
7028 endinpos = startinpos+1;
7029 if (unicode_decode_call_errorhandler(
7030 errors, &errorHandler,
7031 "charmap", "character maps to <undefined>",
7032 &starts, &e, &startinpos, &endinpos, &exc, &s,
7033 &v, &outpos, &p)) {
7034 goto onError;
7035 }
7036 continue;
7037 }
7038 *p++ = x;
7039 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007040 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007041 }
7042 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007043 while (s < e) {
7044 unsigned char ch = *s;
7045 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007046
Benjamin Peterson29060642009-01-31 22:14:21 +00007047 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7048 w = PyLong_FromLong((long)ch);
7049 if (w == NULL)
7050 goto onError;
7051 x = PyObject_GetItem(mapping, w);
7052 Py_DECREF(w);
7053 if (x == NULL) {
7054 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7055 /* No mapping found means: mapping is undefined. */
7056 PyErr_Clear();
7057 x = Py_None;
7058 Py_INCREF(x);
7059 } else
7060 goto onError;
7061 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007062
Benjamin Peterson29060642009-01-31 22:14:21 +00007063 /* Apply mapping */
7064 if (PyLong_Check(x)) {
7065 long value = PyLong_AS_LONG(x);
7066 if (value < 0 || value > 65535) {
7067 PyErr_SetString(PyExc_TypeError,
7068 "character mapping must be in range(65536)");
7069 Py_DECREF(x);
7070 goto onError;
7071 }
7072 *p++ = (Py_UNICODE)value;
7073 }
7074 else if (x == Py_None) {
7075 /* undefined mapping */
7076 outpos = p-PyUnicode_AS_UNICODE(v);
7077 startinpos = s-starts;
7078 endinpos = startinpos+1;
7079 if (unicode_decode_call_errorhandler(
7080 errors, &errorHandler,
7081 "charmap", "character maps to <undefined>",
7082 &starts, &e, &startinpos, &endinpos, &exc, &s,
7083 &v, &outpos, &p)) {
7084 Py_DECREF(x);
7085 goto onError;
7086 }
7087 Py_DECREF(x);
7088 continue;
7089 }
7090 else if (PyUnicode_Check(x)) {
7091 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007092
Benjamin Peterson29060642009-01-31 22:14:21 +00007093 if (targetsize == 1)
7094 /* 1-1 mapping */
7095 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007096
Benjamin Peterson29060642009-01-31 22:14:21 +00007097 else if (targetsize > 1) {
7098 /* 1-n mapping */
7099 if (targetsize > extrachars) {
7100 /* resize first */
7101 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
7102 Py_ssize_t needed = (targetsize - extrachars) + \
7103 (targetsize << 2);
7104 extrachars += needed;
7105 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02007106 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00007107 PyUnicode_GET_SIZE(v) + needed) < 0) {
7108 Py_DECREF(x);
7109 goto onError;
7110 }
7111 p = PyUnicode_AS_UNICODE(v) + oldpos;
7112 }
7113 Py_UNICODE_COPY(p,
7114 PyUnicode_AS_UNICODE(x),
7115 targetsize);
7116 p += targetsize;
7117 extrachars -= targetsize;
7118 }
7119 /* 1-0 mapping: skip the character */
7120 }
7121 else {
7122 /* wrong return value */
7123 PyErr_SetString(PyExc_TypeError,
7124 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007125 Py_DECREF(x);
7126 goto onError;
7127 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007128 Py_DECREF(x);
7129 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007130 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007131 }
7132 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02007133 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007134 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007135 Py_XDECREF(errorHandler);
7136 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02007137#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007138 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007139 Py_DECREF(v);
7140 return NULL;
7141 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007142#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007143 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007144 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00007145
Benjamin Peterson29060642009-01-31 22:14:21 +00007146 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007147 Py_XDECREF(errorHandler);
7148 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007149 Py_XDECREF(v);
7150 return NULL;
7151}
7152
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007153/* Charmap encoding: the lookup table */
7154
Alexander Belopolsky40018472011-02-26 01:02:56 +00007155struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007156 PyObject_HEAD
7157 unsigned char level1[32];
7158 int count2, count3;
7159 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007160};
7161
7162static PyObject*
7163encoding_map_size(PyObject *obj, PyObject* args)
7164{
7165 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007166 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007167 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007168}
7169
7170static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007171 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007172 PyDoc_STR("Return the size (in bytes) of this object") },
7173 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007174};
7175
7176static void
7177encoding_map_dealloc(PyObject* o)
7178{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007179 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007180}
7181
7182static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007183 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007184 "EncodingMap", /*tp_name*/
7185 sizeof(struct encoding_map), /*tp_basicsize*/
7186 0, /*tp_itemsize*/
7187 /* methods */
7188 encoding_map_dealloc, /*tp_dealloc*/
7189 0, /*tp_print*/
7190 0, /*tp_getattr*/
7191 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007192 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007193 0, /*tp_repr*/
7194 0, /*tp_as_number*/
7195 0, /*tp_as_sequence*/
7196 0, /*tp_as_mapping*/
7197 0, /*tp_hash*/
7198 0, /*tp_call*/
7199 0, /*tp_str*/
7200 0, /*tp_getattro*/
7201 0, /*tp_setattro*/
7202 0, /*tp_as_buffer*/
7203 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7204 0, /*tp_doc*/
7205 0, /*tp_traverse*/
7206 0, /*tp_clear*/
7207 0, /*tp_richcompare*/
7208 0, /*tp_weaklistoffset*/
7209 0, /*tp_iter*/
7210 0, /*tp_iternext*/
7211 encoding_map_methods, /*tp_methods*/
7212 0, /*tp_members*/
7213 0, /*tp_getset*/
7214 0, /*tp_base*/
7215 0, /*tp_dict*/
7216 0, /*tp_descr_get*/
7217 0, /*tp_descr_set*/
7218 0, /*tp_dictoffset*/
7219 0, /*tp_init*/
7220 0, /*tp_alloc*/
7221 0, /*tp_new*/
7222 0, /*tp_free*/
7223 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007224};
7225
7226PyObject*
7227PyUnicode_BuildEncodingMap(PyObject* string)
7228{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007229 PyObject *result;
7230 struct encoding_map *mresult;
7231 int i;
7232 int need_dict = 0;
7233 unsigned char level1[32];
7234 unsigned char level2[512];
7235 unsigned char *mlevel1, *mlevel2, *mlevel3;
7236 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007237 int kind;
7238 void *data;
7239 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007240
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007241 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007242 PyErr_BadArgument();
7243 return NULL;
7244 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007245 kind = PyUnicode_KIND(string);
7246 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007247 memset(level1, 0xFF, sizeof level1);
7248 memset(level2, 0xFF, sizeof level2);
7249
7250 /* If there isn't a one-to-one mapping of NULL to \0,
7251 or if there are non-BMP characters, we need to use
7252 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007253 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007254 need_dict = 1;
7255 for (i = 1; i < 256; i++) {
7256 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007257 ch = PyUnicode_READ(kind, data, i);
7258 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007259 need_dict = 1;
7260 break;
7261 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007262 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007263 /* unmapped character */
7264 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007265 l1 = ch >> 11;
7266 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007267 if (level1[l1] == 0xFF)
7268 level1[l1] = count2++;
7269 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007270 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007271 }
7272
7273 if (count2 >= 0xFF || count3 >= 0xFF)
7274 need_dict = 1;
7275
7276 if (need_dict) {
7277 PyObject *result = PyDict_New();
7278 PyObject *key, *value;
7279 if (!result)
7280 return NULL;
7281 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007282 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007283 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007284 if (!key || !value)
7285 goto failed1;
7286 if (PyDict_SetItem(result, key, value) == -1)
7287 goto failed1;
7288 Py_DECREF(key);
7289 Py_DECREF(value);
7290 }
7291 return result;
7292 failed1:
7293 Py_XDECREF(key);
7294 Py_XDECREF(value);
7295 Py_DECREF(result);
7296 return NULL;
7297 }
7298
7299 /* Create a three-level trie */
7300 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7301 16*count2 + 128*count3 - 1);
7302 if (!result)
7303 return PyErr_NoMemory();
7304 PyObject_Init(result, &EncodingMapType);
7305 mresult = (struct encoding_map*)result;
7306 mresult->count2 = count2;
7307 mresult->count3 = count3;
7308 mlevel1 = mresult->level1;
7309 mlevel2 = mresult->level23;
7310 mlevel3 = mresult->level23 + 16*count2;
7311 memcpy(mlevel1, level1, 32);
7312 memset(mlevel2, 0xFF, 16*count2);
7313 memset(mlevel3, 0, 128*count3);
7314 count3 = 0;
7315 for (i = 1; i < 256; i++) {
7316 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007317 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007318 /* unmapped character */
7319 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007320 o1 = PyUnicode_READ(kind, data, i)>>11;
7321 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007322 i2 = 16*mlevel1[o1] + o2;
7323 if (mlevel2[i2] == 0xFF)
7324 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007325 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007326 i3 = 128*mlevel2[i2] + o3;
7327 mlevel3[i3] = i;
7328 }
7329 return result;
7330}
7331
7332static int
7333encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7334{
7335 struct encoding_map *map = (struct encoding_map*)mapping;
7336 int l1 = c>>11;
7337 int l2 = (c>>7) & 0xF;
7338 int l3 = c & 0x7F;
7339 int i;
7340
7341#ifdef Py_UNICODE_WIDE
7342 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007343 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007344 }
7345#endif
7346 if (c == 0)
7347 return 0;
7348 /* level 1*/
7349 i = map->level1[l1];
7350 if (i == 0xFF) {
7351 return -1;
7352 }
7353 /* level 2*/
7354 i = map->level23[16*i+l2];
7355 if (i == 0xFF) {
7356 return -1;
7357 }
7358 /* level 3 */
7359 i = map->level23[16*map->count2 + 128*i + l3];
7360 if (i == 0) {
7361 return -1;
7362 }
7363 return i;
7364}
7365
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007366/* Lookup the character ch in the mapping. If the character
7367 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007368 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007369static PyObject *
7370charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007371{
Christian Heimes217cfd12007-12-02 14:31:20 +00007372 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007373 PyObject *x;
7374
7375 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007376 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007377 x = PyObject_GetItem(mapping, w);
7378 Py_DECREF(w);
7379 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007380 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7381 /* No mapping found means: mapping is undefined. */
7382 PyErr_Clear();
7383 x = Py_None;
7384 Py_INCREF(x);
7385 return x;
7386 } else
7387 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007388 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007389 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007390 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007391 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007392 long value = PyLong_AS_LONG(x);
7393 if (value < 0 || value > 255) {
7394 PyErr_SetString(PyExc_TypeError,
7395 "character mapping must be in range(256)");
7396 Py_DECREF(x);
7397 return NULL;
7398 }
7399 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007400 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007401 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007402 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007403 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007404 /* wrong return value */
7405 PyErr_Format(PyExc_TypeError,
7406 "character mapping must return integer, bytes or None, not %.400s",
7407 x->ob_type->tp_name);
7408 Py_DECREF(x);
7409 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007410 }
7411}
7412
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007413static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007414charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007415{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007416 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7417 /* exponentially overallocate to minimize reallocations */
7418 if (requiredsize < 2*outsize)
7419 requiredsize = 2*outsize;
7420 if (_PyBytes_Resize(outobj, requiredsize))
7421 return -1;
7422 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007423}
7424
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no encoding for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007428/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007429 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007430 space is available. Return a new reference to the object that
7431 was put in the output buffer, or Py_None, if the mapping was undefined
7432 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007433 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007434static charmapencode_result
7435charmapencode_output(Py_UNICODE c, PyObject *mapping,
7436 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007437{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007438 PyObject *rep;
7439 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007440 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007441
Christian Heimes90aa7642007-12-19 02:45:37 +00007442 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007443 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007444 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007445 if (res == -1)
7446 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007447 if (outsize<requiredsize)
7448 if (charmapencode_resize(outobj, outpos, requiredsize))
7449 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007450 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007451 outstart[(*outpos)++] = (char)res;
7452 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007453 }
7454
7455 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007456 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007457 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007458 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007459 Py_DECREF(rep);
7460 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007461 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007462 if (PyLong_Check(rep)) {
7463 Py_ssize_t requiredsize = *outpos+1;
7464 if (outsize<requiredsize)
7465 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7466 Py_DECREF(rep);
7467 return enc_EXCEPTION;
7468 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007469 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007470 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007471 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007472 else {
7473 const char *repchars = PyBytes_AS_STRING(rep);
7474 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7475 Py_ssize_t requiredsize = *outpos+repsize;
7476 if (outsize<requiredsize)
7477 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7478 Py_DECREF(rep);
7479 return enc_EXCEPTION;
7480 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007481 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007482 memcpy(outstart + *outpos, repchars, repsize);
7483 *outpos += repsize;
7484 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007485 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007486 Py_DECREF(rep);
7487 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007488}
7489
7490/* handle an error in PyUnicode_EncodeCharmap
7491 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007492static int
7493charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00007494 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007495 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007496 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007497 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007498{
7499 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007500 Py_ssize_t repsize;
7501 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007502 Py_UNICODE *uni2;
7503 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007504 Py_ssize_t collstartpos = *inpos;
7505 Py_ssize_t collendpos = *inpos+1;
7506 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007507 char *encoding = "charmap";
7508 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007509 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007510
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007511 /* find all unencodable characters */
7512 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007513 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007514 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007515 int res = encoding_map_lookup(p[collendpos], mapping);
7516 if (res != -1)
7517 break;
7518 ++collendpos;
7519 continue;
7520 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007521
Benjamin Peterson29060642009-01-31 22:14:21 +00007522 rep = charmapencode_lookup(p[collendpos], mapping);
7523 if (rep==NULL)
7524 return -1;
7525 else if (rep!=Py_None) {
7526 Py_DECREF(rep);
7527 break;
7528 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007529 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007530 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007531 }
7532 /* cache callback name lookup
7533 * (if not done yet, i.e. it's the first error) */
7534 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007535 if ((errors==NULL) || (!strcmp(errors, "strict")))
7536 *known_errorHandler = 1;
7537 else if (!strcmp(errors, "replace"))
7538 *known_errorHandler = 2;
7539 else if (!strcmp(errors, "ignore"))
7540 *known_errorHandler = 3;
7541 else if (!strcmp(errors, "xmlcharrefreplace"))
7542 *known_errorHandler = 4;
7543 else
7544 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007545 }
7546 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007547 case 1: /* strict */
7548 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7549 return -1;
7550 case 2: /* replace */
7551 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007552 x = charmapencode_output('?', mapping, res, respos);
7553 if (x==enc_EXCEPTION) {
7554 return -1;
7555 }
7556 else if (x==enc_FAILED) {
7557 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7558 return -1;
7559 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007560 }
7561 /* fall through */
7562 case 3: /* ignore */
7563 *inpos = collendpos;
7564 break;
7565 case 4: /* xmlcharrefreplace */
7566 /* generate replacement (temporarily (mis)uses p) */
7567 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007568 char buffer[2+29+1+1];
7569 char *cp;
7570 sprintf(buffer, "&#%d;", (int)p[collpos]);
7571 for (cp = buffer; *cp; ++cp) {
7572 x = charmapencode_output(*cp, mapping, res, respos);
7573 if (x==enc_EXCEPTION)
7574 return -1;
7575 else if (x==enc_FAILED) {
7576 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7577 return -1;
7578 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007579 }
7580 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007581 *inpos = collendpos;
7582 break;
7583 default:
7584 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007585 encoding, reason, p, size, exceptionObject,
7586 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007587 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007588 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007589 if (PyBytes_Check(repunicode)) {
7590 /* Directly copy bytes result to output. */
7591 Py_ssize_t outsize = PyBytes_Size(*res);
7592 Py_ssize_t requiredsize;
7593 repsize = PyBytes_Size(repunicode);
7594 requiredsize = *respos + repsize;
7595 if (requiredsize > outsize)
7596 /* Make room for all additional bytes. */
7597 if (charmapencode_resize(res, respos, requiredsize)) {
7598 Py_DECREF(repunicode);
7599 return -1;
7600 }
7601 memcpy(PyBytes_AsString(*res) + *respos,
7602 PyBytes_AsString(repunicode), repsize);
7603 *respos += repsize;
7604 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007605 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007606 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007607 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007608 /* generate replacement */
7609 repsize = PyUnicode_GET_SIZE(repunicode);
7610 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007611 x = charmapencode_output(*uni2, mapping, res, respos);
7612 if (x==enc_EXCEPTION) {
7613 return -1;
7614 }
7615 else if (x==enc_FAILED) {
7616 Py_DECREF(repunicode);
7617 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7618 return -1;
7619 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007620 }
7621 *inpos = newpos;
7622 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007623 }
7624 return 0;
7625}
7626
Alexander Belopolsky40018472011-02-26 01:02:56 +00007627PyObject *
7628PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7629 Py_ssize_t size,
7630 PyObject *mapping,
7631 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007632{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007633 /* output object */
7634 PyObject *res = NULL;
7635 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007636 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007637 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007638 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007639 PyObject *errorHandler = NULL;
7640 PyObject *exc = NULL;
7641 /* the following variable is used for caching string comparisons
7642 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7643 * 3=ignore, 4=xmlcharrefreplace */
7644 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007645
7646 /* Default to Latin-1 */
7647 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007648 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007649
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007650 /* allocate enough for a simple encoding without
7651 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007652 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007653 if (res == NULL)
7654 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007655 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007656 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007657
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007658 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007659 /* try to encode it */
7660 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7661 if (x==enc_EXCEPTION) /* error */
7662 goto onError;
7663 if (x==enc_FAILED) { /* unencodable character */
7664 if (charmap_encoding_error(p, size, &inpos, mapping,
7665 &exc,
7666 &known_errorHandler, &errorHandler, errors,
7667 &res, &respos)) {
7668 goto onError;
7669 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007670 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007671 else
7672 /* done with this character => adjust input position */
7673 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007674 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007675
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007676 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007677 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007678 if (_PyBytes_Resize(&res, respos) < 0)
7679 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007680
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007681 Py_XDECREF(exc);
7682 Py_XDECREF(errorHandler);
7683 return res;
7684
Benjamin Peterson29060642009-01-31 22:14:21 +00007685 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007686 Py_XDECREF(res);
7687 Py_XDECREF(exc);
7688 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007689 return NULL;
7690}
7691
Alexander Belopolsky40018472011-02-26 01:02:56 +00007692PyObject *
7693PyUnicode_AsCharmapString(PyObject *unicode,
7694 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007695{
7696 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007697 PyErr_BadArgument();
7698 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007699 }
7700 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007701 PyUnicode_GET_SIZE(unicode),
7702 mapping,
7703 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007704}
7705
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007706/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007707static void
7708make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007709 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007710 Py_ssize_t startpos, Py_ssize_t endpos,
7711 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007712{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007713 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007714 *exceptionObject = _PyUnicodeTranslateError_Create(
7715 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007716 }
7717 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007718 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7719 goto onError;
7720 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7721 goto onError;
7722 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7723 goto onError;
7724 return;
7725 onError:
7726 Py_DECREF(*exceptionObject);
7727 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007728 }
7729}
7730
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007731/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007732static void
7733raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007734 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007735 Py_ssize_t startpos, Py_ssize_t endpos,
7736 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007737{
7738 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007739 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007740 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007741 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007742}
7743
7744/* error handling callback helper:
7745 build arguments, call the callback and check the arguments,
7746 put the result into newpos and return the replacement string, which
7747 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007748static PyObject *
7749unicode_translate_call_errorhandler(const char *errors,
7750 PyObject **errorHandler,
7751 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007752 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007753 Py_ssize_t startpos, Py_ssize_t endpos,
7754 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007755{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007756 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007757
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007758 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007759 PyObject *restuple;
7760 PyObject *resunicode;
7761
7762 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007763 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007764 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007765 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007766 }
7767
7768 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007769 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007770 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007771 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007772
7773 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007774 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007775 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007776 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007777 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007778 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007779 Py_DECREF(restuple);
7780 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007781 }
7782 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007783 &resunicode, &i_newpos)) {
7784 Py_DECREF(restuple);
7785 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007786 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007787 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007788 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007789 else
7790 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007791 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007792 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7793 Py_DECREF(restuple);
7794 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007795 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007796 Py_INCREF(resunicode);
7797 Py_DECREF(restuple);
7798 return resunicode;
7799}
7800
7801/* Lookup the character ch in the mapping and put the result in result,
7802 which must be decrefed by the caller.
7803 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007804static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007805charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007806{
Christian Heimes217cfd12007-12-02 14:31:20 +00007807 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007808 PyObject *x;
7809
7810 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007811 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007812 x = PyObject_GetItem(mapping, w);
7813 Py_DECREF(w);
7814 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007815 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7816 /* No mapping found means: use 1:1 mapping. */
7817 PyErr_Clear();
7818 *result = NULL;
7819 return 0;
7820 } else
7821 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007822 }
7823 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007824 *result = x;
7825 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007826 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007827 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007828 long value = PyLong_AS_LONG(x);
7829 long max = PyUnicode_GetMax();
7830 if (value < 0 || value > max) {
7831 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007832 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007833 Py_DECREF(x);
7834 return -1;
7835 }
7836 *result = x;
7837 return 0;
7838 }
7839 else if (PyUnicode_Check(x)) {
7840 *result = x;
7841 return 0;
7842 }
7843 else {
7844 /* wrong return value */
7845 PyErr_SetString(PyExc_TypeError,
7846 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007847 Py_DECREF(x);
7848 return -1;
7849 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007850}
7851/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007852 if not reallocate and adjust various state variables.
7853 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007854static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007855charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007856 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007857{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007858 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007859 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007860 /* exponentially overallocate to minimize reallocations */
7861 if (requiredsize < 2 * oldsize)
7862 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007863 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7864 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007865 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007866 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007867 }
7868 return 0;
7869}
7870/* lookup the character, put the result in the output string and adjust
7871 various state variables. Return a new reference to the object that
7872 was put in the output buffer in *result, or Py_None, if the mapping was
7873 undefined (in which case no character was written).
7874 The called must decref result.
7875 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007876static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007877charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7878 PyObject *mapping, Py_UCS4 **output,
7879 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007880 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007881{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007882 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7883 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007884 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007885 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007886 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007887 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007888 }
7889 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007890 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007891 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007892 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007893 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007894 }
7895 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007896 Py_ssize_t repsize;
7897 if (PyUnicode_READY(*res) == -1)
7898 return -1;
7899 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007900 if (repsize==1) {
7901 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007902 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007903 }
7904 else if (repsize!=0) {
7905 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007906 Py_ssize_t requiredsize = *opos +
7907 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007908 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007909 Py_ssize_t i;
7910 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007911 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007912 for(i = 0; i < repsize; i++)
7913 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007914 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007915 }
7916 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007917 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007918 return 0;
7919}
7920
Alexander Belopolsky40018472011-02-26 01:02:56 +00007921PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007922_PyUnicode_TranslateCharmap(PyObject *input,
7923 PyObject *mapping,
7924 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007925{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007926 /* input object */
7927 char *idata;
7928 Py_ssize_t size, i;
7929 int kind;
7930 /* output buffer */
7931 Py_UCS4 *output = NULL;
7932 Py_ssize_t osize;
7933 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007934 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007935 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007936 char *reason = "character maps to <undefined>";
7937 PyObject *errorHandler = NULL;
7938 PyObject *exc = NULL;
7939 /* the following variable is used for caching string comparisons
7940 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7941 * 3=ignore, 4=xmlcharrefreplace */
7942 int known_errorHandler = -1;
7943
Guido van Rossumd57fd912000-03-10 22:53:23 +00007944 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007945 PyErr_BadArgument();
7946 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007947 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007948
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007949 if (PyUnicode_READY(input) == -1)
7950 return NULL;
7951 idata = (char*)PyUnicode_DATA(input);
7952 kind = PyUnicode_KIND(input);
7953 size = PyUnicode_GET_LENGTH(input);
7954 i = 0;
7955
7956 if (size == 0) {
7957 Py_INCREF(input);
7958 return input;
7959 }
7960
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007961 /* allocate enough for a simple 1:1 translation without
7962 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007963 osize = size;
7964 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7965 opos = 0;
7966 if (output == NULL) {
7967 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007968 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007969 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007970
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007971 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007972 /* try to encode it */
7973 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007974 if (charmaptranslate_output(input, i, mapping,
7975 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007976 Py_XDECREF(x);
7977 goto onError;
7978 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007979 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007980 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007981 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007982 else { /* untranslatable character */
7983 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7984 Py_ssize_t repsize;
7985 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007986 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007987 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007988 Py_ssize_t collstart = i;
7989 Py_ssize_t collend = i+1;
7990 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007991
Benjamin Peterson29060642009-01-31 22:14:21 +00007992 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007993 while (collend < size) {
7994 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007995 goto onError;
7996 Py_XDECREF(x);
7997 if (x!=Py_None)
7998 break;
7999 ++collend;
8000 }
8001 /* cache callback name lookup
8002 * (if not done yet, i.e. it's the first error) */
8003 if (known_errorHandler==-1) {
8004 if ((errors==NULL) || (!strcmp(errors, "strict")))
8005 known_errorHandler = 1;
8006 else if (!strcmp(errors, "replace"))
8007 known_errorHandler = 2;
8008 else if (!strcmp(errors, "ignore"))
8009 known_errorHandler = 3;
8010 else if (!strcmp(errors, "xmlcharrefreplace"))
8011 known_errorHandler = 4;
8012 else
8013 known_errorHandler = 0;
8014 }
8015 switch (known_errorHandler) {
8016 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008017 raise_translate_exception(&exc, input, collstart,
8018 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008019 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008020 case 2: /* replace */
8021 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008022 for (coll = collstart; coll<collend; coll++)
8023 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008024 /* fall through */
8025 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008026 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008027 break;
8028 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008029 /* generate replacement (temporarily (mis)uses i) */
8030 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008031 char buffer[2+29+1+1];
8032 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008033 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8034 if (charmaptranslate_makespace(&output, &osize,
8035 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008036 goto onError;
8037 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008038 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008039 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008040 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008041 break;
8042 default:
8043 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008044 reason, input, &exc,
8045 collstart, collend, &newpos);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008046 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00008047 goto onError;
8048 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008049 repsize = PyUnicode_GET_LENGTH(repunicode);
8050 if (charmaptranslate_makespace(&output, &osize,
8051 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008052 Py_DECREF(repunicode);
8053 goto onError;
8054 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008055 for (uni2 = 0; repsize-->0; ++uni2)
8056 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8057 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008058 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008059 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008060 }
8061 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008062 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8063 if (!res)
8064 goto onError;
8065 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008066 Py_XDECREF(exc);
8067 Py_XDECREF(errorHandler);
8068 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008069
Benjamin Peterson29060642009-01-31 22:14:21 +00008070 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008071 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008072 Py_XDECREF(exc);
8073 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008074 return NULL;
8075}
8076
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008077/* Deprecated. Use PyUnicode_Translate instead. */
8078PyObject *
8079PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8080 Py_ssize_t size,
8081 PyObject *mapping,
8082 const char *errors)
8083{
8084 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8085 if (!unicode)
8086 return NULL;
8087 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8088}
8089
Alexander Belopolsky40018472011-02-26 01:02:56 +00008090PyObject *
8091PyUnicode_Translate(PyObject *str,
8092 PyObject *mapping,
8093 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008094{
8095 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008096
Guido van Rossumd57fd912000-03-10 22:53:23 +00008097 str = PyUnicode_FromObject(str);
8098 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008099 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008100 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008101 Py_DECREF(str);
8102 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008103
Benjamin Peterson29060642009-01-31 22:14:21 +00008104 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008105 Py_XDECREF(str);
8106 return NULL;
8107}
Tim Petersced69f82003-09-16 20:30:58 +00008108
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008109static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008110fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008111{
8112 /* No need to call PyUnicode_READY(self) because this function is only
8113 called as a callback from fixup() which does it already. */
8114 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8115 const int kind = PyUnicode_KIND(self);
8116 void *data = PyUnicode_DATA(self);
8117 Py_UCS4 maxchar = 0, ch, fixed;
8118 Py_ssize_t i;
8119
8120 for (i = 0; i < len; ++i) {
8121 ch = PyUnicode_READ(kind, data, i);
8122 fixed = 0;
8123 if (ch > 127) {
8124 if (Py_UNICODE_ISSPACE(ch))
8125 fixed = ' ';
8126 else {
8127 const int decimal = Py_UNICODE_TODECIMAL(ch);
8128 if (decimal >= 0)
8129 fixed = '0' + decimal;
8130 }
8131 if (fixed != 0) {
8132 if (fixed > maxchar)
8133 maxchar = fixed;
8134 PyUnicode_WRITE(kind, data, i, fixed);
8135 }
8136 else if (ch > maxchar)
8137 maxchar = ch;
8138 }
8139 else if (ch > maxchar)
8140 maxchar = ch;
8141 }
8142
8143 return maxchar;
8144}
8145
8146PyObject *
8147_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8148{
8149 if (!PyUnicode_Check(unicode)) {
8150 PyErr_BadInternalCall();
8151 return NULL;
8152 }
8153 if (PyUnicode_READY(unicode) == -1)
8154 return NULL;
8155 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8156 /* If the string is already ASCII, just return the same string */
8157 Py_INCREF(unicode);
8158 return unicode;
8159 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008160 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008161}
8162
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008163PyObject *
8164PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8165 Py_ssize_t length)
8166{
8167 PyObject *result;
8168 Py_UNICODE *p; /* write pointer into result */
8169 Py_ssize_t i;
8170 /* Copy to a new string */
8171 result = (PyObject *)_PyUnicode_New(length);
8172 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8173 if (result == NULL)
8174 return result;
8175 p = PyUnicode_AS_UNICODE(result);
8176 /* Iterate over code points */
8177 for (i = 0; i < length; i++) {
8178 Py_UNICODE ch =s[i];
8179 if (ch > 127) {
8180 int decimal = Py_UNICODE_TODECIMAL(ch);
8181 if (decimal >= 0)
8182 p[i] = '0' + decimal;
8183 }
8184 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008185#ifndef DONT_MAKE_RESULT_READY
8186 if (_PyUnicode_READY_REPLACE(&result)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008187 Py_DECREF(result);
8188 return NULL;
8189 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008190#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02008191 assert(_PyUnicode_CheckConsistency(result, 1));
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008192 return result;
8193}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008194/* --- Decimal Encoder ---------------------------------------------------- */
8195
Alexander Belopolsky40018472011-02-26 01:02:56 +00008196int
8197PyUnicode_EncodeDecimal(Py_UNICODE *s,
8198 Py_ssize_t length,
8199 char *output,
8200 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008201{
8202 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008203 PyObject *errorHandler = NULL;
8204 PyObject *exc = NULL;
8205 const char *encoding = "decimal";
8206 const char *reason = "invalid decimal Unicode string";
8207 /* the following variable is used for caching string comparisons
8208 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8209 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008210
8211 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008212 PyErr_BadArgument();
8213 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008214 }
8215
8216 p = s;
8217 end = s + length;
8218 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008219 register Py_UNICODE ch = *p;
8220 int decimal;
8221 PyObject *repunicode;
8222 Py_ssize_t repsize;
8223 Py_ssize_t newpos;
8224 Py_UNICODE *uni2;
8225 Py_UNICODE *collstart;
8226 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00008227
Benjamin Peterson29060642009-01-31 22:14:21 +00008228 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008229 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008230 ++p;
8231 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008232 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008233 decimal = Py_UNICODE_TODECIMAL(ch);
8234 if (decimal >= 0) {
8235 *output++ = '0' + decimal;
8236 ++p;
8237 continue;
8238 }
8239 if (0 < ch && ch < 256) {
8240 *output++ = (char)ch;
8241 ++p;
8242 continue;
8243 }
8244 /* All other characters are considered unencodable */
8245 collstart = p;
8246 collend = p+1;
8247 while (collend < end) {
8248 if ((0 < *collend && *collend < 256) ||
8249 !Py_UNICODE_ISSPACE(*collend) ||
8250 Py_UNICODE_TODECIMAL(*collend))
8251 break;
8252 }
8253 /* cache callback name lookup
8254 * (if not done yet, i.e. it's the first error) */
8255 if (known_errorHandler==-1) {
8256 if ((errors==NULL) || (!strcmp(errors, "strict")))
8257 known_errorHandler = 1;
8258 else if (!strcmp(errors, "replace"))
8259 known_errorHandler = 2;
8260 else if (!strcmp(errors, "ignore"))
8261 known_errorHandler = 3;
8262 else if (!strcmp(errors, "xmlcharrefreplace"))
8263 known_errorHandler = 4;
8264 else
8265 known_errorHandler = 0;
8266 }
8267 switch (known_errorHandler) {
8268 case 1: /* strict */
8269 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8270 goto onError;
8271 case 2: /* replace */
8272 for (p = collstart; p < collend; ++p)
8273 *output++ = '?';
8274 /* fall through */
8275 case 3: /* ignore */
8276 p = collend;
8277 break;
8278 case 4: /* xmlcharrefreplace */
8279 /* generate replacement (temporarily (mis)uses p) */
8280 for (p = collstart; p < collend; ++p)
8281 output += sprintf(output, "&#%d;", (int)*p);
8282 p = collend;
8283 break;
8284 default:
8285 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8286 encoding, reason, s, length, &exc,
8287 collstart-s, collend-s, &newpos);
8288 if (repunicode == NULL)
8289 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008290 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008291 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008292 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8293 Py_DECREF(repunicode);
8294 goto onError;
8295 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008296 /* generate replacement */
8297 repsize = PyUnicode_GET_SIZE(repunicode);
8298 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8299 Py_UNICODE ch = *uni2;
8300 if (Py_UNICODE_ISSPACE(ch))
8301 *output++ = ' ';
8302 else {
8303 decimal = Py_UNICODE_TODECIMAL(ch);
8304 if (decimal >= 0)
8305 *output++ = '0' + decimal;
8306 else if (0 < ch && ch < 256)
8307 *output++ = (char)ch;
8308 else {
8309 Py_DECREF(repunicode);
8310 raise_encode_exception(&exc, encoding,
8311 s, length, collstart-s, collend-s, reason);
8312 goto onError;
8313 }
8314 }
8315 }
8316 p = s + newpos;
8317 Py_DECREF(repunicode);
8318 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008319 }
8320 /* 0-terminate the output string */
8321 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008322 Py_XDECREF(exc);
8323 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008324 return 0;
8325
Benjamin Peterson29060642009-01-31 22:14:21 +00008326 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008327 Py_XDECREF(exc);
8328 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008329 return -1;
8330}
8331
Guido van Rossumd57fd912000-03-10 22:53:23 +00008332/* --- Helpers ------------------------------------------------------------ */
8333
Victor Stinnerc3cec782011-10-05 21:24:08 +02008334#include "stringlib/asciilib.h"
8335#include "stringlib/fastsearch.h"
8336#include "stringlib/partition.h"
8337#include "stringlib/split.h"
8338#include "stringlib/count.h"
8339#include "stringlib/find.h"
8340#include "stringlib/localeutil.h"
8341#include "stringlib/undef.h"
8342
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008343#include "stringlib/ucs1lib.h"
8344#include "stringlib/fastsearch.h"
8345#include "stringlib/partition.h"
8346#include "stringlib/split.h"
8347#include "stringlib/count.h"
8348#include "stringlib/find.h"
8349#include "stringlib/localeutil.h"
8350#include "stringlib/undef.h"
8351
8352#include "stringlib/ucs2lib.h"
8353#include "stringlib/fastsearch.h"
8354#include "stringlib/partition.h"
8355#include "stringlib/split.h"
8356#include "stringlib/count.h"
8357#include "stringlib/find.h"
8358#include "stringlib/localeutil.h"
8359#include "stringlib/undef.h"
8360
8361#include "stringlib/ucs4lib.h"
8362#include "stringlib/fastsearch.h"
8363#include "stringlib/partition.h"
8364#include "stringlib/split.h"
8365#include "stringlib/count.h"
8366#include "stringlib/find.h"
8367#include "stringlib/localeutil.h"
8368#include "stringlib/undef.h"
8369
8370static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02008371any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ascii)(const Py_UCS1*, Py_ssize_t,
8372 const Py_UCS1*, Py_ssize_t,
8373 Py_ssize_t, Py_ssize_t),
8374 Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008375 const Py_UCS1*, Py_ssize_t,
8376 Py_ssize_t, Py_ssize_t),
8377 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
8378 const Py_UCS2*, Py_ssize_t,
8379 Py_ssize_t, Py_ssize_t),
8380 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
8381 const Py_UCS4*, Py_ssize_t,
8382 Py_ssize_t, Py_ssize_t),
8383 PyObject* s1, PyObject* s2,
8384 Py_ssize_t start,
8385 Py_ssize_t end)
8386{
8387 int kind1, kind2, kind;
8388 void *buf1, *buf2;
8389 Py_ssize_t len1, len2, result;
8390
8391 kind1 = PyUnicode_KIND(s1);
8392 kind2 = PyUnicode_KIND(s2);
8393 kind = kind1 > kind2 ? kind1 : kind2;
8394 buf1 = PyUnicode_DATA(s1);
8395 buf2 = PyUnicode_DATA(s2);
8396 if (kind1 != kind)
8397 buf1 = _PyUnicode_AsKind(s1, kind);
8398 if (!buf1)
8399 return -2;
8400 if (kind2 != kind)
8401 buf2 = _PyUnicode_AsKind(s2, kind);
8402 if (!buf2) {
8403 if (kind1 != kind) PyMem_Free(buf1);
8404 return -2;
8405 }
8406 len1 = PyUnicode_GET_LENGTH(s1);
8407 len2 = PyUnicode_GET_LENGTH(s2);
8408
8409 switch(kind) {
8410 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008411 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8412 result = ascii(buf1, len1, buf2, len2, start, end);
8413 else
8414 result = ucs1(buf1, len1, buf2, len2, start, end);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008415 break;
8416 case PyUnicode_2BYTE_KIND:
8417 result = ucs2(buf1, len1, buf2, len2, start, end);
8418 break;
8419 case PyUnicode_4BYTE_KIND:
8420 result = ucs4(buf1, len1, buf2, len2, start, end);
8421 break;
8422 default:
8423 assert(0); result = -2;
8424 }
8425
8426 if (kind1 != kind)
8427 PyMem_Free(buf1);
8428 if (kind2 != kind)
8429 PyMem_Free(buf2);
8430
8431 return result;
8432}
8433
8434Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02008435_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008436 Py_ssize_t n_buffer,
8437 void *digits, Py_ssize_t n_digits,
8438 Py_ssize_t min_width,
8439 const char *grouping,
8440 const char *thousands_sep)
8441{
8442 switch(kind) {
8443 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008444 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
8445 return _PyUnicode_ascii_InsertThousandsGrouping(
8446 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8447 min_width, grouping, thousands_sep);
8448 else
8449 return _PyUnicode_ucs1_InsertThousandsGrouping(
8450 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8451 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008452 case PyUnicode_2BYTE_KIND:
8453 return _PyUnicode_ucs2_InsertThousandsGrouping(
8454 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8455 min_width, grouping, thousands_sep);
8456 case PyUnicode_4BYTE_KIND:
8457 return _PyUnicode_ucs4_InsertThousandsGrouping(
8458 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8459 min_width, grouping, thousands_sep);
8460 }
8461 assert(0);
8462 return -1;
8463}
8464
8465
Eric Smith8c663262007-08-25 02:26:07 +00008466#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00008467#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008468
Thomas Wouters477c8d52006-05-27 19:21:47 +00008469#include "stringlib/count.h"
8470#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00008471
/* helper macro to fixup start/end slice values: `end` is clipped to `len`,
   negative `start`/`end` are taken relative to `len` and then clamped at 0.
   `start` and `end` must be modifiable lvalues; `len` is evaluated more than
   once.  Wrapped in do { } while (0) so that the macro behaves as a single
   statement (e.g. as the body of an unbraced if/else); invoke it with a
   trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008486
Alexander Belopolsky40018472011-02-26 01:02:56 +00008487Py_ssize_t
8488PyUnicode_Count(PyObject *str,
8489 PyObject *substr,
8490 Py_ssize_t start,
8491 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008492{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008493 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008494 PyUnicodeObject* str_obj;
8495 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008496 int kind1, kind2, kind;
8497 void *buf1 = NULL, *buf2 = NULL;
8498 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008499
Thomas Wouters477c8d52006-05-27 19:21:47 +00008500 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008501 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008502 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008503 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008504 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008505 Py_DECREF(str_obj);
8506 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008507 }
Tim Petersced69f82003-09-16 20:30:58 +00008508
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008509 kind1 = PyUnicode_KIND(str_obj);
8510 kind2 = PyUnicode_KIND(sub_obj);
8511 kind = kind1 > kind2 ? kind1 : kind2;
8512 buf1 = PyUnicode_DATA(str_obj);
8513 if (kind1 != kind)
8514 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8515 if (!buf1)
8516 goto onError;
8517 buf2 = PyUnicode_DATA(sub_obj);
8518 if (kind2 != kind)
8519 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8520 if (!buf2)
8521 goto onError;
8522 len1 = PyUnicode_GET_LENGTH(str_obj);
8523 len2 = PyUnicode_GET_LENGTH(sub_obj);
8524
8525 ADJUST_INDICES(start, end, len1);
8526 switch(kind) {
8527 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008528 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8529 result = asciilib_count(
8530 ((Py_UCS1*)buf1) + start, end - start,
8531 buf2, len2, PY_SSIZE_T_MAX
8532 );
8533 else
8534 result = ucs1lib_count(
8535 ((Py_UCS1*)buf1) + start, end - start,
8536 buf2, len2, PY_SSIZE_T_MAX
8537 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008538 break;
8539 case PyUnicode_2BYTE_KIND:
8540 result = ucs2lib_count(
8541 ((Py_UCS2*)buf1) + start, end - start,
8542 buf2, len2, PY_SSIZE_T_MAX
8543 );
8544 break;
8545 case PyUnicode_4BYTE_KIND:
8546 result = ucs4lib_count(
8547 ((Py_UCS4*)buf1) + start, end - start,
8548 buf2, len2, PY_SSIZE_T_MAX
8549 );
8550 break;
8551 default:
8552 assert(0); result = 0;
8553 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008554
8555 Py_DECREF(sub_obj);
8556 Py_DECREF(str_obj);
8557
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008558 if (kind1 != kind)
8559 PyMem_Free(buf1);
8560 if (kind2 != kind)
8561 PyMem_Free(buf2);
8562
Guido van Rossumd57fd912000-03-10 22:53:23 +00008563 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008564 onError:
8565 Py_DECREF(sub_obj);
8566 Py_DECREF(str_obj);
8567 if (kind1 != kind && buf1)
8568 PyMem_Free(buf1);
8569 if (kind2 != kind && buf2)
8570 PyMem_Free(buf2);
8571 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008572}
8573
Alexander Belopolsky40018472011-02-26 01:02:56 +00008574Py_ssize_t
8575PyUnicode_Find(PyObject *str,
8576 PyObject *sub,
8577 Py_ssize_t start,
8578 Py_ssize_t end,
8579 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008580{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008581 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008582
Guido van Rossumd57fd912000-03-10 22:53:23 +00008583 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008584 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008585 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008586 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008587 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008588 Py_DECREF(str);
8589 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008590 }
Tim Petersced69f82003-09-16 20:30:58 +00008591
Thomas Wouters477c8d52006-05-27 19:21:47 +00008592 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008593 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008594 asciilib_find_slice, ucs1lib_find_slice,
8595 ucs2lib_find_slice, ucs4lib_find_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008596 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008597 );
8598 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008599 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008600 asciilib_find_slice, ucs1lib_rfind_slice,
8601 ucs2lib_rfind_slice, ucs4lib_rfind_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008602 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008603 );
8604
Guido van Rossumd57fd912000-03-10 22:53:23 +00008605 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008606 Py_DECREF(sub);
8607
Guido van Rossumd57fd912000-03-10 22:53:23 +00008608 return result;
8609}
8610
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008611Py_ssize_t
8612PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8613 Py_ssize_t start, Py_ssize_t end,
8614 int direction)
8615{
8616 char *result;
8617 int kind;
8618 if (PyUnicode_READY(str) == -1)
8619 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008620 if (start < 0 || end < 0) {
8621 PyErr_SetString(PyExc_IndexError, "string index out of range");
8622 return -2;
8623 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008624 if (end > PyUnicode_GET_LENGTH(str))
8625 end = PyUnicode_GET_LENGTH(str);
8626 kind = PyUnicode_KIND(str);
8627 result = findchar(PyUnicode_1BYTE_DATA(str)
8628 + PyUnicode_KIND_SIZE(kind, start),
8629 kind,
8630 end-start, ch, direction);
8631 if (!result)
8632 return -1;
8633 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8634}
8635
Alexander Belopolsky40018472011-02-26 01:02:56 +00008636static int
8637tailmatch(PyUnicodeObject *self,
8638 PyUnicodeObject *substring,
8639 Py_ssize_t start,
8640 Py_ssize_t end,
8641 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008642{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008643 int kind_self;
8644 int kind_sub;
8645 void *data_self;
8646 void *data_sub;
8647 Py_ssize_t offset;
8648 Py_ssize_t i;
8649 Py_ssize_t end_sub;
8650
8651 if (PyUnicode_READY(self) == -1 ||
8652 PyUnicode_READY(substring) == -1)
8653 return 0;
8654
8655 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008656 return 1;
8657
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008658 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8659 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008660 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008661 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008662
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008663 kind_self = PyUnicode_KIND(self);
8664 data_self = PyUnicode_DATA(self);
8665 kind_sub = PyUnicode_KIND(substring);
8666 data_sub = PyUnicode_DATA(substring);
8667 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8668
8669 if (direction > 0)
8670 offset = end;
8671 else
8672 offset = start;
8673
8674 if (PyUnicode_READ(kind_self, data_self, offset) ==
8675 PyUnicode_READ(kind_sub, data_sub, 0) &&
8676 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8677 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8678 /* If both are of the same kind, memcmp is sufficient */
8679 if (kind_self == kind_sub) {
8680 return ! memcmp((char *)data_self +
8681 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8682 data_sub,
8683 PyUnicode_GET_LENGTH(substring) *
8684 PyUnicode_CHARACTER_SIZE(substring));
8685 }
8686 /* otherwise we have to compare each character by first accesing it */
8687 else {
8688 /* We do not need to compare 0 and len(substring)-1 because
8689 the if statement above ensured already that they are equal
8690 when we end up here. */
8691 // TODO: honor direction and do a forward or backwards search
8692 for (i = 1; i < end_sub; ++i) {
8693 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8694 PyUnicode_READ(kind_sub, data_sub, i))
8695 return 0;
8696 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008697 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008698 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008699 }
8700
8701 return 0;
8702}
8703
Alexander Belopolsky40018472011-02-26 01:02:56 +00008704Py_ssize_t
8705PyUnicode_Tailmatch(PyObject *str,
8706 PyObject *substr,
8707 Py_ssize_t start,
8708 Py_ssize_t end,
8709 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008710{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008711 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008712
Guido van Rossumd57fd912000-03-10 22:53:23 +00008713 str = PyUnicode_FromObject(str);
8714 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008715 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008716 substr = PyUnicode_FromObject(substr);
8717 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008718 Py_DECREF(str);
8719 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008720 }
Tim Petersced69f82003-09-16 20:30:58 +00008721
Guido van Rossumd57fd912000-03-10 22:53:23 +00008722 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008723 (PyUnicodeObject *)substr,
8724 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008725 Py_DECREF(str);
8726 Py_DECREF(substr);
8727 return result;
8728}
8729
Guido van Rossumd57fd912000-03-10 22:53:23 +00008730/* Apply fixfct filter to the Unicode object self and return a
8731 reference to the modified object */
8732
Alexander Belopolsky40018472011-02-26 01:02:56 +00008733static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02008734fixup(PyObject *self,
8735 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008736{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008737 PyObject *u;
8738 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008739
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008740 if (PyUnicode_READY(self) == -1)
8741 return NULL;
8742 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8743 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8744 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008745 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008746 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008747
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008748 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8749 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008750
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008751 /* fix functions return the new maximum character in a string,
8752 if the kind of the resulting unicode object does not change,
8753 everything is fine. Otherwise we need to change the string kind
8754 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02008755 maxchar_new = fixfct(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008756 if (maxchar_new == 0)
8757 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8758 else if (maxchar_new <= 127)
8759 maxchar_new = 127;
8760 else if (maxchar_new <= 255)
8761 maxchar_new = 255;
8762 else if (maxchar_new <= 65535)
8763 maxchar_new = 65535;
8764 else
8765 maxchar_new = 1114111; /* 0x10ffff */
8766
8767 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008768 /* fixfct should return TRUE if it modified the buffer. If
8769 FALSE, return a reference to the original buffer instead
8770 (to save space, not time) */
8771 Py_INCREF(self);
8772 Py_DECREF(u);
8773 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008774 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008775 else if (maxchar_new == maxchar_old) {
8776 return u;
8777 }
8778 else {
8779 /* In case the maximum character changed, we need to
8780 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008781 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008782 if (v == NULL) {
8783 Py_DECREF(u);
8784 return NULL;
8785 }
8786 if (maxchar_new > maxchar_old) {
8787 /* If the maxchar increased so that the kind changed, not all
8788 characters are representable anymore and we need to fix the
8789 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008790 if (PyUnicode_CopyCharacters(v, 0,
8791 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008792 PyUnicode_GET_LENGTH(self)) < 0)
8793 {
8794 Py_DECREF(u);
8795 return NULL;
8796 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008797 maxchar_old = fixfct(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008798 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8799 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008800 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008801 if (PyUnicode_CopyCharacters(v, 0,
8802 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008803 PyUnicode_GET_LENGTH(self)) < 0)
8804 {
8805 Py_DECREF(u);
8806 return NULL;
8807 }
8808 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008809
8810 Py_DECREF(u);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02008811 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008812 return v;
8813 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008814}
8815
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008816static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008817fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008818{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008819 /* No need to call PyUnicode_READY(self) because this function is only
8820 called as a callback from fixup() which does it already. */
8821 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8822 const int kind = PyUnicode_KIND(self);
8823 void *data = PyUnicode_DATA(self);
8824 int touched = 0;
8825 Py_UCS4 maxchar = 0;
8826 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008827
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008828 for (i = 0; i < len; ++i) {
8829 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8830 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8831 if (up != ch) {
8832 if (up > maxchar)
8833 maxchar = up;
8834 PyUnicode_WRITE(kind, data, i, up);
8835 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008836 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008837 else if (ch > maxchar)
8838 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008839 }
8840
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008841 if (touched)
8842 return maxchar;
8843 else
8844 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008845}
8846
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008847static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008848fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008849{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008850 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8851 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8852 const int kind = PyUnicode_KIND(self);
8853 void *data = PyUnicode_DATA(self);
8854 int touched = 0;
8855 Py_UCS4 maxchar = 0;
8856 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008857
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008858 for(i = 0; i < len; ++i) {
8859 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8860 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8861 if (lo != ch) {
8862 if (lo > maxchar)
8863 maxchar = lo;
8864 PyUnicode_WRITE(kind, data, i, lo);
8865 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008866 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008867 else if (ch > maxchar)
8868 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008869 }
8870
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008871 if (touched)
8872 return maxchar;
8873 else
8874 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008875}
8876
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008877static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008878fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008879{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008880 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8881 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8882 const int kind = PyUnicode_KIND(self);
8883 void *data = PyUnicode_DATA(self);
8884 int touched = 0;
8885 Py_UCS4 maxchar = 0;
8886 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008887
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008888 for(i = 0; i < len; ++i) {
8889 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8890 Py_UCS4 nu = 0;
8891
8892 if (Py_UNICODE_ISUPPER(ch))
8893 nu = Py_UNICODE_TOLOWER(ch);
8894 else if (Py_UNICODE_ISLOWER(ch))
8895 nu = Py_UNICODE_TOUPPER(ch);
8896
8897 if (nu != 0) {
8898 if (nu > maxchar)
8899 maxchar = nu;
8900 PyUnicode_WRITE(kind, data, i, nu);
8901 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008902 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008903 else if (ch > maxchar)
8904 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008905 }
8906
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008907 if (touched)
8908 return maxchar;
8909 else
8910 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008911}
8912
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008913static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008914fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008915{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008916 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8917 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8918 const int kind = PyUnicode_KIND(self);
8919 void *data = PyUnicode_DATA(self);
8920 int touched = 0;
8921 Py_UCS4 maxchar = 0;
8922 Py_ssize_t i = 0;
8923 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008924
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008925 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008926 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008927
8928 ch = PyUnicode_READ(kind, data, i);
8929 if (!Py_UNICODE_ISUPPER(ch)) {
8930 maxchar = Py_UNICODE_TOUPPER(ch);
8931 PyUnicode_WRITE(kind, data, i, maxchar);
8932 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008933 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008934 ++i;
8935 for(; i < len; ++i) {
8936 ch = PyUnicode_READ(kind, data, i);
8937 if (!Py_UNICODE_ISLOWER(ch)) {
8938 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8939 if (lo > maxchar)
8940 maxchar = lo;
8941 PyUnicode_WRITE(kind, data, i, lo);
8942 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008943 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008944 else if (ch > maxchar)
8945 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008946 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008947
8948 if (touched)
8949 return maxchar;
8950 else
8951 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008952}
8953
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008954static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008955fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008956{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008957 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8958 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8959 const int kind = PyUnicode_KIND(self);
8960 void *data = PyUnicode_DATA(self);
8961 Py_UCS4 maxchar = 0;
8962 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008963 int previous_is_cased;
8964
8965 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008966 if (len == 1) {
8967 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8968 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8969 if (ti != ch) {
8970 PyUnicode_WRITE(kind, data, i, ti);
8971 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008972 }
8973 else
8974 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008975 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008976 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008977 for(; i < len; ++i) {
8978 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8979 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008980
Benjamin Peterson29060642009-01-31 22:14:21 +00008981 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008982 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008983 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008984 nu = Py_UNICODE_TOTITLE(ch);
8985
8986 if (nu > maxchar)
8987 maxchar = nu;
8988 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008989
Benjamin Peterson29060642009-01-31 22:14:21 +00008990 if (Py_UNICODE_ISLOWER(ch) ||
8991 Py_UNICODE_ISUPPER(ch) ||
8992 Py_UNICODE_ISTITLE(ch))
8993 previous_is_cased = 1;
8994 else
8995 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008996 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008997 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008998}
8999
Tim Peters8ce9f162004-08-27 01:49:32 +00009000PyObject *
9001PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009002{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009003 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009004 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009005 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009006 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009007 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9008 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009009 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009010 Py_ssize_t sz, i, res_offset;
9011 Py_UCS4 maxchar = 0;
9012 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009013
Tim Peters05eba1f2004-08-27 21:32:02 +00009014 fseq = PySequence_Fast(seq, "");
9015 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009016 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009017 }
9018
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009019 /* NOTE: the following code can't call back into Python code,
9020 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009021 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009022
Tim Peters05eba1f2004-08-27 21:32:02 +00009023 seqlen = PySequence_Fast_GET_SIZE(fseq);
9024 /* If empty sequence, return u"". */
9025 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009026 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009027 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00009028 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009029 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009030 /* If singleton sequence with an exact Unicode, return that. */
9031 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009032 item = items[0];
9033 if (PyUnicode_CheckExact(item)) {
9034 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009035 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00009036 goto Done;
9037 }
Tim Peters8ce9f162004-08-27 01:49:32 +00009038 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009039 else {
9040 /* Set up sep and seplen */
9041 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009042 /* fall back to a blank space separator */
9043 sep = PyUnicode_FromOrdinal(' ');
Victor Stinnere9a29352011-10-01 02:14:59 +02009044 if (!sep)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009045 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00009046 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009047 else {
9048 if (!PyUnicode_Check(separator)) {
9049 PyErr_Format(PyExc_TypeError,
9050 "separator: expected str instance,"
9051 " %.80s found",
9052 Py_TYPE(separator)->tp_name);
9053 goto onError;
9054 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02009055 if (PyUnicode_READY(separator))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009056 goto onError;
9057 sep = separator;
9058 seplen = PyUnicode_GET_LENGTH(separator);
9059 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
Georg Brandl7597add2011-10-05 16:36:47 +02009060 /* inc refcount to keep this code path symmetric with the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009061 above case of a blank separator */
9062 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00009063 }
9064 }
9065
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009066 /* There are at least two things to join, or else we have a subclass
9067 * of str in the sequence.
9068 * Do a pre-pass to figure out the total amount of space we'll
9069 * need (sz), and see whether all argument are strings.
9070 */
9071 sz = 0;
9072 for (i = 0; i < seqlen; i++) {
9073 const Py_ssize_t old_sz = sz;
9074 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009075 if (!PyUnicode_Check(item)) {
9076 PyErr_Format(PyExc_TypeError,
9077 "sequence item %zd: expected str instance,"
9078 " %.80s found",
9079 i, Py_TYPE(item)->tp_name);
9080 goto onError;
9081 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009082 if (PyUnicode_READY(item) == -1)
9083 goto onError;
9084 sz += PyUnicode_GET_LENGTH(item);
9085 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
9086 if (item_maxchar > maxchar)
9087 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009088 if (i != 0)
9089 sz += seplen;
9090 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9091 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009092 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009093 goto onError;
9094 }
9095 }
Tim Petersced69f82003-09-16 20:30:58 +00009096
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009097 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009098 if (res == NULL)
9099 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009100
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009101 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009102 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinner9ce5a832011-10-03 23:36:02 +02009103 Py_ssize_t itemlen, copied;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009104 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009105 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009106 if (i && seplen != 0) {
9107 copied = PyUnicode_CopyCharacters(res, res_offset,
9108 sep, 0, seplen);
9109 if (copied < 0)
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009110 goto onError;
Victor Stinner9ce5a832011-10-03 23:36:02 +02009111#ifdef Py_DEBUG
9112 res_offset += copied;
9113#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009114 res_offset += seplen;
Victor Stinner9ce5a832011-10-03 23:36:02 +02009115#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00009116 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009117 itemlen = PyUnicode_GET_LENGTH(item);
9118 if (itemlen != 0) {
9119 copied = PyUnicode_CopyCharacters(res, res_offset,
9120 item, 0, itemlen);
9121 if (copied < 0)
9122 goto onError;
9123#ifdef Py_DEBUG
9124 res_offset += copied;
9125#else
9126 res_offset += itemlen;
9127#endif
9128 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009129 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009130 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009131
Benjamin Peterson29060642009-01-31 22:14:21 +00009132 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00009133 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009134 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009135 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009136 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009137
Benjamin Peterson29060642009-01-31 22:14:21 +00009138 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009139 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009140 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009141 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009142 return NULL;
9143}
9144
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009145#define FILL(kind, data, value, start, length) \
9146 do { \
9147 Py_ssize_t i_ = 0; \
9148 assert(kind != PyUnicode_WCHAR_KIND); \
9149 switch ((kind)) { \
9150 case PyUnicode_1BYTE_KIND: { \
9151 unsigned char * to_ = (unsigned char *)((data)) + (start); \
9152 memset(to_, (unsigned char)value, length); \
9153 break; \
9154 } \
9155 case PyUnicode_2BYTE_KIND: { \
9156 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
9157 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9158 break; \
9159 } \
9160 default: { \
9161 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
9162 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9163 break; \
9164 } \
9165 } \
9166 } while (0)
9167
Victor Stinner9310abb2011-10-05 00:59:23 +02009168static PyObject *
9169pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009170 Py_ssize_t left,
9171 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009172 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009173{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009174 PyObject *u;
9175 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009176 int kind;
9177 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009178
9179 if (left < 0)
9180 left = 0;
9181 if (right < 0)
9182 right = 0;
9183
Tim Peters7a29bd52001-09-12 03:03:31 +00009184 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009185 Py_INCREF(self);
9186 return self;
9187 }
9188
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009189 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9190 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009191 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9192 return NULL;
9193 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009194 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9195 if (fill > maxchar)
9196 maxchar = fill;
9197 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009198 if (!u)
9199 return NULL;
9200
9201 kind = PyUnicode_KIND(u);
9202 data = PyUnicode_DATA(u);
9203 if (left)
9204 FILL(kind, data, fill, 0, left);
9205 if (right)
9206 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02009207 if (PyUnicode_CopyCharacters(u, left,
9208 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009209 _PyUnicode_LENGTH(self)) < 0)
9210 {
9211 Py_DECREF(u);
9212 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009213 }
9214
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009215 assert(_PyUnicode_CheckConsistency(u, 1));
9216 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009217}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009218#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009219
Alexander Belopolsky40018472011-02-26 01:02:56 +00009220PyObject *
9221PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009222{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009223 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009224
9225 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009226 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009227 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009228
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009229 switch(PyUnicode_KIND(string)) {
9230 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009231 if (PyUnicode_IS_ASCII(string))
9232 list = asciilib_splitlines(
9233 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9234 PyUnicode_GET_LENGTH(string), keepends);
9235 else
9236 list = ucs1lib_splitlines(
9237 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9238 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009239 break;
9240 case PyUnicode_2BYTE_KIND:
9241 list = ucs2lib_splitlines(
9242 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
9243 PyUnicode_GET_LENGTH(string), keepends);
9244 break;
9245 case PyUnicode_4BYTE_KIND:
9246 list = ucs4lib_splitlines(
9247 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
9248 PyUnicode_GET_LENGTH(string), keepends);
9249 break;
9250 default:
9251 assert(0);
9252 list = 0;
9253 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009254 Py_DECREF(string);
9255 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009256}
9257
Alexander Belopolsky40018472011-02-26 01:02:56 +00009258static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009259split(PyObject *self,
9260 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009261 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009262{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009263 int kind1, kind2, kind;
9264 void *buf1, *buf2;
9265 Py_ssize_t len1, len2;
9266 PyObject* out;
9267
Guido van Rossumd57fd912000-03-10 22:53:23 +00009268 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009269 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009270
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009271 if (PyUnicode_READY(self) == -1)
9272 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009273
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009274 if (substring == NULL)
9275 switch(PyUnicode_KIND(self)) {
9276 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009277 if (PyUnicode_IS_ASCII(self))
9278 return asciilib_split_whitespace(
9279 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9280 PyUnicode_GET_LENGTH(self), maxcount
9281 );
9282 else
9283 return ucs1lib_split_whitespace(
9284 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9285 PyUnicode_GET_LENGTH(self), maxcount
9286 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009287 case PyUnicode_2BYTE_KIND:
9288 return ucs2lib_split_whitespace(
9289 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9290 PyUnicode_GET_LENGTH(self), maxcount
9291 );
9292 case PyUnicode_4BYTE_KIND:
9293 return ucs4lib_split_whitespace(
9294 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9295 PyUnicode_GET_LENGTH(self), maxcount
9296 );
9297 default:
9298 assert(0);
9299 return NULL;
9300 }
9301
9302 if (PyUnicode_READY(substring) == -1)
9303 return NULL;
9304
9305 kind1 = PyUnicode_KIND(self);
9306 kind2 = PyUnicode_KIND(substring);
9307 kind = kind1 > kind2 ? kind1 : kind2;
9308 buf1 = PyUnicode_DATA(self);
9309 buf2 = PyUnicode_DATA(substring);
9310 if (kind1 != kind)
9311 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9312 if (!buf1)
9313 return NULL;
9314 if (kind2 != kind)
9315 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9316 if (!buf2) {
9317 if (kind1 != kind) PyMem_Free(buf1);
9318 return NULL;
9319 }
9320 len1 = PyUnicode_GET_LENGTH(self);
9321 len2 = PyUnicode_GET_LENGTH(substring);
9322
9323 switch(kind) {
9324 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009325 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9326 out = asciilib_split(
9327 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9328 else
9329 out = ucs1lib_split(
9330 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009331 break;
9332 case PyUnicode_2BYTE_KIND:
9333 out = ucs2lib_split(
9334 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9335 break;
9336 case PyUnicode_4BYTE_KIND:
9337 out = ucs4lib_split(
9338 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9339 break;
9340 default:
9341 out = NULL;
9342 }
9343 if (kind1 != kind)
9344 PyMem_Free(buf1);
9345 if (kind2 != kind)
9346 PyMem_Free(buf2);
9347 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009348}
9349
Alexander Belopolsky40018472011-02-26 01:02:56 +00009350static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009351rsplit(PyObject *self,
9352 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009353 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009354{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009355 int kind1, kind2, kind;
9356 void *buf1, *buf2;
9357 Py_ssize_t len1, len2;
9358 PyObject* out;
9359
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009360 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009361 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009362
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009363 if (PyUnicode_READY(self) == -1)
9364 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009365
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009366 if (substring == NULL)
9367 switch(PyUnicode_KIND(self)) {
9368 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009369 if (PyUnicode_IS_ASCII(self))
9370 return asciilib_rsplit_whitespace(
9371 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9372 PyUnicode_GET_LENGTH(self), maxcount
9373 );
9374 else
9375 return ucs1lib_rsplit_whitespace(
9376 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9377 PyUnicode_GET_LENGTH(self), maxcount
9378 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009379 case PyUnicode_2BYTE_KIND:
9380 return ucs2lib_rsplit_whitespace(
9381 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9382 PyUnicode_GET_LENGTH(self), maxcount
9383 );
9384 case PyUnicode_4BYTE_KIND:
9385 return ucs4lib_rsplit_whitespace(
9386 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9387 PyUnicode_GET_LENGTH(self), maxcount
9388 );
9389 default:
9390 assert(0);
9391 return NULL;
9392 }
9393
9394 if (PyUnicode_READY(substring) == -1)
9395 return NULL;
9396
9397 kind1 = PyUnicode_KIND(self);
9398 kind2 = PyUnicode_KIND(substring);
9399 kind = kind1 > kind2 ? kind1 : kind2;
9400 buf1 = PyUnicode_DATA(self);
9401 buf2 = PyUnicode_DATA(substring);
9402 if (kind1 != kind)
9403 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9404 if (!buf1)
9405 return NULL;
9406 if (kind2 != kind)
9407 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9408 if (!buf2) {
9409 if (kind1 != kind) PyMem_Free(buf1);
9410 return NULL;
9411 }
9412 len1 = PyUnicode_GET_LENGTH(self);
9413 len2 = PyUnicode_GET_LENGTH(substring);
9414
9415 switch(kind) {
9416 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009417 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9418 out = asciilib_rsplit(
9419 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9420 else
9421 out = ucs1lib_rsplit(
9422 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009423 break;
9424 case PyUnicode_2BYTE_KIND:
9425 out = ucs2lib_rsplit(
9426 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9427 break;
9428 case PyUnicode_4BYTE_KIND:
9429 out = ucs4lib_rsplit(
9430 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9431 break;
9432 default:
9433 out = NULL;
9434 }
9435 if (kind1 != kind)
9436 PyMem_Free(buf1);
9437 if (kind2 != kind)
9438 PyMem_Free(buf2);
9439 return out;
9440}
9441
9442static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009443anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9444 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009445{
9446 switch(kind) {
9447 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009448 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9449 return asciilib_find(buf1, len1, buf2, len2, offset);
9450 else
9451 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009452 case PyUnicode_2BYTE_KIND:
9453 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9454 case PyUnicode_4BYTE_KIND:
9455 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9456 }
9457 assert(0);
9458 return -1;
9459}
9460
9461static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009462anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9463 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009464{
9465 switch(kind) {
9466 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009467 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9468 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9469 else
9470 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009471 case PyUnicode_2BYTE_KIND:
9472 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9473 case PyUnicode_4BYTE_KIND:
9474 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9475 }
9476 assert(0);
9477 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009478}
9479
Alexander Belopolsky40018472011-02-26 01:02:56 +00009480static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009481replace(PyObject *self, PyObject *str1,
9482 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009483{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009484 PyObject *u;
9485 char *sbuf = PyUnicode_DATA(self);
9486 char *buf1 = PyUnicode_DATA(str1);
9487 char *buf2 = PyUnicode_DATA(str2);
9488 int srelease = 0, release1 = 0, release2 = 0;
9489 int skind = PyUnicode_KIND(self);
9490 int kind1 = PyUnicode_KIND(str1);
9491 int kind2 = PyUnicode_KIND(str2);
9492 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9493 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9494 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009495
9496 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009497 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009498 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009499 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009500
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009501 if (skind < kind1)
9502 /* substring too wide to be present */
9503 goto nothing;
9504
9505 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00009506 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009507 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009508 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009509 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009510 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009511 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009512 Py_UCS4 u1, u2, maxchar;
9513 int mayshrink, rkind;
9514 u1 = PyUnicode_READ_CHAR(str1, 0);
9515 if (!findchar(sbuf, PyUnicode_KIND(self),
9516 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00009517 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009518 u2 = PyUnicode_READ_CHAR(str2, 0);
9519 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9520 /* Replacing u1 with u2 may cause a maxchar reduction in the
9521 result string. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009522 if (u2 > maxchar) {
9523 maxchar = u2;
9524 mayshrink = 0;
9525 }
Victor Stinnerb9275c12011-10-05 14:01:42 +02009526 else
9527 mayshrink = maxchar > 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009528 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009529 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009530 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009531 if (PyUnicode_CopyCharacters(u, 0,
9532 (PyObject*)self, 0, slen) < 0)
9533 {
9534 Py_DECREF(u);
9535 return NULL;
9536 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009537 rkind = PyUnicode_KIND(u);
9538 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9539 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009540 if (--maxcount < 0)
9541 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009542 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009543 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009544 if (mayshrink) {
9545 PyObject *tmp = u;
9546 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
9547 PyUnicode_GET_LENGTH(tmp));
9548 Py_DECREF(tmp);
9549 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009550 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009551 int rkind = skind;
9552 char *res;
9553 if (kind1 < rkind) {
9554 /* widen substring */
9555 buf1 = _PyUnicode_AsKind(str1, rkind);
9556 if (!buf1) goto error;
9557 release1 = 1;
9558 }
Victor Stinnerc3cec782011-10-05 21:24:08 +02009559 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009560 if (i < 0)
9561 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009562 if (rkind > kind2) {
9563 /* widen replacement */
9564 buf2 = _PyUnicode_AsKind(str2, rkind);
9565 if (!buf2) goto error;
9566 release2 = 1;
9567 }
9568 else if (rkind < kind2) {
9569 /* widen self and buf1 */
9570 rkind = kind2;
9571 if (release1) PyMem_Free(buf1);
9572 sbuf = _PyUnicode_AsKind(self, rkind);
9573 if (!sbuf) goto error;
9574 srelease = 1;
9575 buf1 = _PyUnicode_AsKind(str1, rkind);
9576 if (!buf1) goto error;
9577 release1 = 1;
9578 }
9579 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
9580 if (!res) {
9581 PyErr_NoMemory();
9582 goto error;
9583 }
9584 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009585 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009586 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9587 buf2,
9588 PyUnicode_KIND_SIZE(rkind, len2));
9589 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009590
9591 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +02009592 i = anylib_find(rkind, self,
9593 sbuf+PyUnicode_KIND_SIZE(rkind, i), slen-i,
9594 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009595 if (i == -1)
9596 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009597 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9598 buf2,
9599 PyUnicode_KIND_SIZE(rkind, len2));
9600 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009601 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009602
9603 u = PyUnicode_FromKindAndData(rkind, res, slen);
9604 PyMem_Free(res);
9605 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009606 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009607 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009608
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009609 Py_ssize_t n, i, j, ires;
9610 Py_ssize_t product, new_size;
9611 int rkind = skind;
9612 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009613
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009614 if (kind1 < rkind) {
9615 buf1 = _PyUnicode_AsKind(str1, rkind);
9616 if (!buf1) goto error;
9617 release1 = 1;
9618 }
Victor Stinnerc3cec782011-10-05 21:24:08 +02009619 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009620 if (n == 0)
9621 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009622 if (kind2 < rkind) {
9623 buf2 = _PyUnicode_AsKind(str2, rkind);
9624 if (!buf2) goto error;
9625 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009626 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009627 else if (kind2 > rkind) {
9628 rkind = kind2;
9629 sbuf = _PyUnicode_AsKind(self, rkind);
9630 if (!sbuf) goto error;
9631 srelease = 1;
9632 if (release1) PyMem_Free(buf1);
9633 buf1 = _PyUnicode_AsKind(str1, rkind);
9634 if (!buf1) goto error;
9635 release1 = 1;
9636 }
9637 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9638 PyUnicode_GET_LENGTH(str1))); */
9639 product = n * (len2-len1);
9640 if ((product / (len2-len1)) != n) {
9641 PyErr_SetString(PyExc_OverflowError,
9642 "replace string is too long");
9643 goto error;
9644 }
9645 new_size = slen + product;
9646 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9647 PyErr_SetString(PyExc_OverflowError,
9648 "replace string is too long");
9649 goto error;
9650 }
9651 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9652 if (!res)
9653 goto error;
9654 ires = i = 0;
9655 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009656 while (n-- > 0) {
9657 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +02009658 j = anylib_find(rkind, self,
9659 sbuf + PyUnicode_KIND_SIZE(rkind, i), slen-i,
9660 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009661 if (j == -1)
9662 break;
9663 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009664 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009665 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9666 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9667 PyUnicode_KIND_SIZE(rkind, j-i));
9668 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009669 }
9670 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009671 if (len2 > 0) {
9672 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9673 buf2,
9674 PyUnicode_KIND_SIZE(rkind, len2));
9675 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009676 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009677 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009678 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009679 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009680 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009681 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9682 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9683 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009684 } else {
9685 /* interleave */
9686 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009687 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9688 buf2,
9689 PyUnicode_KIND_SIZE(rkind, len2));
9690 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009691 if (--n <= 0)
9692 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009693 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9694 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9695 PyUnicode_KIND_SIZE(rkind, 1));
9696 ires++;
9697 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009698 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009699 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9700 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9701 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009702 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009703 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Martin v. Löwis0b1d3482011-10-01 16:35:40 +02009704 PyMem_Free(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009705 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009706 if (srelease)
9707 PyMem_FREE(sbuf);
9708 if (release1)
9709 PyMem_FREE(buf1);
9710 if (release2)
9711 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009712 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009713 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009714
Benjamin Peterson29060642009-01-31 22:14:21 +00009715 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009716 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009717 if (srelease)
9718 PyMem_FREE(sbuf);
9719 if (release1)
9720 PyMem_FREE(buf1);
9721 if (release2)
9722 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009723 if (PyUnicode_CheckExact(self)) {
9724 Py_INCREF(self);
9725 return (PyObject *) self;
9726 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009727 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009728 error:
9729 if (srelease && sbuf)
9730 PyMem_FREE(sbuf);
9731 if (release1 && buf1)
9732 PyMem_FREE(buf1);
9733 if (release2 && buf2)
9734 PyMem_FREE(buf2);
9735 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009736}
9737
9738/* --- Unicode Object Methods --------------------------------------------- */
9739
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009740PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009741 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009742\n\
9743Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009744characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009745
9746static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +02009747unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009748{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009749 return fixup(self, fixtitle);
9750}
9751
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009752PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009753 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009754\n\
9755Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009756have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009757
9758static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +02009759unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009760{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009761 return fixup(self, fixcapitalize);
9762}
9763
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled code.  Splits self into a list of words, replaces each list
   item by its fixcapitalize'd version, then joins the list back into one
   string.  Returns NULL if splitting or any per-word fixup fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        result = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                       fixcapitalize);
        if (result == NULL)
            goto done;
        /* swap the new word in, releasing the list's old item */
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, result);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
9801
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009802/* Argument converter. Coerces to a single unicode character */
9803
9804static int
9805convert_uc(PyObject *obj, void *addr)
9806{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009807 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009808 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009809
Benjamin Peterson14339b62009-01-31 16:36:08 +00009810 uniobj = PyUnicode_FromObject(obj);
9811 if (uniobj == NULL) {
9812 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009813 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009814 return 0;
9815 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009816 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009817 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009818 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009819 Py_DECREF(uniobj);
9820 return 0;
9821 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009822 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009823 Py_DECREF(uniobj);
9824 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009825}
9826
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009827PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009828 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009829\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009830Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009831done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009832
9833static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009834unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009835{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009836 Py_ssize_t marg, left;
9837 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009838 Py_UCS4 fillchar = ' ';
9839
Victor Stinnere9a29352011-10-01 02:14:59 +02009840 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009841 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009842
Victor Stinnere9a29352011-10-01 02:14:59 +02009843 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009844 return NULL;
9845
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009846 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009847 Py_INCREF(self);
9848 return (PyObject*) self;
9849 }
9850
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009851 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009852 left = marg / 2 + (marg & width & 1);
9853
Victor Stinner9310abb2011-10-05 00:59:23 +02009854 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009855}
9856
Marc-André Lemburge5034372000-08-08 08:04:29 +00009857#if 0
9858
9859/* This code should go into some future Unicode collation support
9860 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009861 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009862
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009863/* speedy UTF-16 code point order comparison */
9864/* gleaned from: */
9865/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9866
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009867static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009868{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009869 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009870 0, 0, 0, 0, 0, 0, 0, 0,
9871 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009872 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009873};
9874
Guido van Rossumd57fd912000-03-10 22:53:23 +00009875static int
9876unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9877{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009878 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009879
Guido van Rossumd57fd912000-03-10 22:53:23 +00009880 Py_UNICODE *s1 = str1->str;
9881 Py_UNICODE *s2 = str2->str;
9882
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009883 len1 = str1->_base._base.length;
9884 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009885
Guido van Rossumd57fd912000-03-10 22:53:23 +00009886 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009887 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009888
9889 c1 = *s1++;
9890 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009891
Benjamin Peterson29060642009-01-31 22:14:21 +00009892 if (c1 > (1<<11) * 26)
9893 c1 += utf16Fixup[c1>>11];
9894 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009895 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009896 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009897
9898 if (c1 != c2)
9899 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009900
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009901 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009902 }
9903
9904 return (len1 < len2) ? -1 : (len1 != len2);
9905}
9906
Marc-André Lemburge5034372000-08-08 08:04:29 +00009907#else
9908
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009909/* This function assumes that str1 and str2 are readied by the caller. */
9910
Marc-André Lemburge5034372000-08-08 08:04:29 +00009911static int
9912unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9913{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009914 int kind1, kind2;
9915 void *data1, *data2;
9916 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009917
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009918 kind1 = PyUnicode_KIND(str1);
9919 kind2 = PyUnicode_KIND(str2);
9920 data1 = PyUnicode_DATA(str1);
9921 data2 = PyUnicode_DATA(str2);
9922 len1 = PyUnicode_GET_LENGTH(str1);
9923 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009924
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009925 for (i = 0; i < len1 && i < len2; ++i) {
9926 Py_UCS4 c1, c2;
9927 c1 = PyUnicode_READ(kind1, data1, i);
9928 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009929
9930 if (c1 != c2)
9931 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009932 }
9933
9934 return (len1 < len2) ? -1 : (len1 != len2);
9935}
9936
9937#endif
9938
Alexander Belopolsky40018472011-02-26 01:02:56 +00009939int
9940PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009941{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009942 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9943 if (PyUnicode_READY(left) == -1 ||
9944 PyUnicode_READY(right) == -1)
9945 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009946 return unicode_compare((PyUnicodeObject *)left,
9947 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009948 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009949 PyErr_Format(PyExc_TypeError,
9950 "Can't compare %.100s and %.100s",
9951 left->ob_type->tp_name,
9952 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009953 return -1;
9954}
9955
Martin v. Löwis5b222132007-06-10 09:51:05 +00009956int
9957PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9958{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009959 Py_ssize_t i;
9960 int kind;
9961 void *data;
9962 Py_UCS4 chr;
9963
Victor Stinner910337b2011-10-03 03:20:16 +02009964 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009965 if (PyUnicode_READY(uni) == -1)
9966 return -1;
9967 kind = PyUnicode_KIND(uni);
9968 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009969 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009970 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9971 if (chr != str[i])
9972 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009973 /* This check keeps Python strings that end in '\0' from comparing equal
9974 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009975 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009976 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009977 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009978 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009979 return 0;
9980}
9981
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009982
Benjamin Peterson29060642009-01-31 22:14:21 +00009983#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009984 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009985
Alexander Belopolsky40018472011-02-26 01:02:56 +00009986PyObject *
9987PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009988{
9989 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009990
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009991 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9992 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009993 if (PyUnicode_READY(left) == -1 ||
9994 PyUnicode_READY(right) == -1)
9995 return NULL;
9996 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9997 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009998 if (op == Py_EQ) {
9999 Py_INCREF(Py_False);
10000 return Py_False;
10001 }
10002 if (op == Py_NE) {
10003 Py_INCREF(Py_True);
10004 return Py_True;
10005 }
10006 }
10007 if (left == right)
10008 result = 0;
10009 else
10010 result = unicode_compare((PyUnicodeObject *)left,
10011 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010012
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010013 /* Convert the return value to a Boolean */
10014 switch (op) {
10015 case Py_EQ:
10016 v = TEST_COND(result == 0);
10017 break;
10018 case Py_NE:
10019 v = TEST_COND(result != 0);
10020 break;
10021 case Py_LE:
10022 v = TEST_COND(result <= 0);
10023 break;
10024 case Py_GE:
10025 v = TEST_COND(result >= 0);
10026 break;
10027 case Py_LT:
10028 v = TEST_COND(result == -1);
10029 break;
10030 case Py_GT:
10031 v = TEST_COND(result == 1);
10032 break;
10033 default:
10034 PyErr_BadArgument();
10035 return NULL;
10036 }
10037 Py_INCREF(v);
10038 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010039 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010040
Brian Curtindfc80e32011-08-10 20:28:54 -050010041 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010042}
10043
Alexander Belopolsky40018472011-02-26 01:02:56 +000010044int
10045PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010046{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010047 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010048 int kind1, kind2, kind;
10049 void *buf1, *buf2;
10050 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010051 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010052
10053 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010054 sub = PyUnicode_FromObject(element);
10055 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010056 PyErr_Format(PyExc_TypeError,
10057 "'in <string>' requires string as left operand, not %s",
10058 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010059 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010060 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010061 if (PyUnicode_READY(sub) == -1)
10062 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010063
Thomas Wouters477c8d52006-05-27 19:21:47 +000010064 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010065 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010066 Py_DECREF(sub);
10067 return -1;
10068 }
10069
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010070 kind1 = PyUnicode_KIND(str);
10071 kind2 = PyUnicode_KIND(sub);
10072 kind = kind1 > kind2 ? kind1 : kind2;
10073 buf1 = PyUnicode_DATA(str);
10074 buf2 = PyUnicode_DATA(sub);
10075 if (kind1 != kind)
10076 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
10077 if (!buf1) {
10078 Py_DECREF(sub);
10079 return -1;
10080 }
10081 if (kind2 != kind)
10082 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
10083 if (!buf2) {
10084 Py_DECREF(sub);
10085 if (kind1 != kind) PyMem_Free(buf1);
10086 return -1;
10087 }
10088 len1 = PyUnicode_GET_LENGTH(str);
10089 len2 = PyUnicode_GET_LENGTH(sub);
10090
10091 switch(kind) {
10092 case PyUnicode_1BYTE_KIND:
10093 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10094 break;
10095 case PyUnicode_2BYTE_KIND:
10096 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10097 break;
10098 case PyUnicode_4BYTE_KIND:
10099 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10100 break;
10101 default:
10102 result = -1;
10103 assert(0);
10104 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010105
10106 Py_DECREF(str);
10107 Py_DECREF(sub);
10108
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010109 if (kind1 != kind)
10110 PyMem_Free(buf1);
10111 if (kind2 != kind)
10112 PyMem_Free(buf2);
10113
Guido van Rossum403d68b2000-03-13 15:55:09 +000010114 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010115}
10116
Guido van Rossumd57fd912000-03-10 22:53:23 +000010117/* Concat to string or Unicode object giving a new Unicode object. */
10118
Alexander Belopolsky40018472011-02-26 01:02:56 +000010119PyObject *
10120PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010121{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010122 PyObject *u = NULL, *v = NULL, *w;
10123 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010124
10125 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010126 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010127 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010128 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010129 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010130 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010131 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010132
10133 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010134 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010135 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010136 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010137 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010138 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010139 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010140 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010141 }
10142
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010143 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +020010144 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010145
Guido van Rossumd57fd912000-03-10 22:53:23 +000010146 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010147 w = PyUnicode_New(
10148 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10149 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010150 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010151 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010152 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
10153 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +020010154 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010155 v, 0,
10156 PyUnicode_GET_LENGTH(v)) < 0)
10157 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010158 Py_DECREF(u);
10159 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010160 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010161 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010162
Benjamin Peterson29060642009-01-31 22:14:21 +000010163 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010164 Py_XDECREF(u);
10165 Py_XDECREF(v);
10166 return NULL;
10167}
10168
Victor Stinnerb0923652011-10-04 01:17:31 +020010169static void
10170unicode_append_inplace(PyObject **p_left, PyObject *right)
10171{
10172 Py_ssize_t left_len, right_len, new_len;
10173#ifdef Py_DEBUG
10174 Py_ssize_t copied;
10175#endif
10176
10177 assert(PyUnicode_IS_READY(*p_left));
10178 assert(PyUnicode_IS_READY(right));
10179
10180 left_len = PyUnicode_GET_LENGTH(*p_left);
10181 right_len = PyUnicode_GET_LENGTH(right);
10182 if (left_len > PY_SSIZE_T_MAX - right_len) {
10183 PyErr_SetString(PyExc_OverflowError,
10184 "strings are too large to concat");
10185 goto error;
10186 }
10187 new_len = left_len + right_len;
10188
10189 /* Now we own the last reference to 'left', so we can resize it
10190 * in-place.
10191 */
10192 if (unicode_resize(p_left, new_len) != 0) {
10193 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10194 * deallocated so it cannot be put back into
10195 * 'variable'. The MemoryError is raised when there
10196 * is no value in 'variable', which might (very
10197 * remotely) be a cause of incompatibilities.
10198 */
10199 goto error;
10200 }
10201 /* copy 'right' into the newly allocated area of 'left' */
10202#ifdef Py_DEBUG
10203 copied = PyUnicode_CopyCharacters(*p_left, left_len,
10204 right, 0,
10205 right_len);
10206 assert(0 <= copied);
10207#else
10208 PyUnicode_CopyCharacters(*p_left, left_len, right, 0, right_len);
10209#endif
10210 return;
10211
10212error:
10213 Py_DECREF(*p_left);
10214 *p_left = NULL;
10215}
10216
Walter Dörwald1ab83302007-05-18 17:15:44 +000010217void
Victor Stinner23e56682011-10-03 03:54:37 +020010218PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010219{
Victor Stinner23e56682011-10-03 03:54:37 +020010220 PyObject *left, *res;
10221
10222 if (p_left == NULL) {
10223 if (!PyErr_Occurred())
10224 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010225 return;
10226 }
Victor Stinner23e56682011-10-03 03:54:37 +020010227 left = *p_left;
10228 if (right == NULL || !PyUnicode_Check(left)) {
10229 if (!PyErr_Occurred())
10230 PyErr_BadInternalCall();
10231 goto error;
10232 }
10233
Victor Stinnere1335c72011-10-04 20:53:03 +020010234 if (PyUnicode_READY(left))
10235 goto error;
10236 if (PyUnicode_READY(right))
10237 goto error;
10238
Victor Stinner23e56682011-10-03 03:54:37 +020010239 if (PyUnicode_CheckExact(left) && left != unicode_empty
10240 && PyUnicode_CheckExact(right) && right != unicode_empty
10241 && unicode_resizable(left)
10242 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10243 || _PyUnicode_WSTR(left) != NULL))
10244 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010245 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10246 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010247 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010248 not so different than duplicating the string. */
10249 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010250 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010251 unicode_append_inplace(p_left, right);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010252 if (p_left != NULL)
10253 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010254 return;
10255 }
10256 }
10257
10258 res = PyUnicode_Concat(left, right);
10259 if (res == NULL)
10260 goto error;
10261 Py_DECREF(left);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010262 assert(_PyUnicode_CheckConsistency(res, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010263 *p_left = res;
10264 return;
10265
10266error:
10267 Py_DECREF(*p_left);
10268 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010269}
10270
10271void
10272PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10273{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010274 PyUnicode_Append(pleft, right);
10275 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010276}
10277
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010278PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010279 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010280\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010281Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010282string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010283interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010284
10285static PyObject *
10286unicode_count(PyUnicodeObject *self, PyObject *args)
10287{
10288 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010289 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010290 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010291 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010292 int kind1, kind2, kind;
10293 void *buf1, *buf2;
10294 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010295
Jesus Ceaac451502011-04-20 17:09:23 +020010296 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10297 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010298 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010299
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010300 kind1 = PyUnicode_KIND(self);
10301 kind2 = PyUnicode_KIND(substring);
10302 kind = kind1 > kind2 ? kind1 : kind2;
10303 buf1 = PyUnicode_DATA(self);
10304 buf2 = PyUnicode_DATA(substring);
10305 if (kind1 != kind)
10306 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10307 if (!buf1) {
10308 Py_DECREF(substring);
10309 return NULL;
10310 }
10311 if (kind2 != kind)
10312 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10313 if (!buf2) {
10314 Py_DECREF(substring);
10315 if (kind1 != kind) PyMem_Free(buf1);
10316 return NULL;
10317 }
10318 len1 = PyUnicode_GET_LENGTH(self);
10319 len2 = PyUnicode_GET_LENGTH(substring);
10320
10321 ADJUST_INDICES(start, end, len1);
10322 switch(kind) {
10323 case PyUnicode_1BYTE_KIND:
10324 iresult = ucs1lib_count(
10325 ((Py_UCS1*)buf1) + start, end - start,
10326 buf2, len2, PY_SSIZE_T_MAX
10327 );
10328 break;
10329 case PyUnicode_2BYTE_KIND:
10330 iresult = ucs2lib_count(
10331 ((Py_UCS2*)buf1) + start, end - start,
10332 buf2, len2, PY_SSIZE_T_MAX
10333 );
10334 break;
10335 case PyUnicode_4BYTE_KIND:
10336 iresult = ucs4lib_count(
10337 ((Py_UCS4*)buf1) + start, end - start,
10338 buf2, len2, PY_SSIZE_T_MAX
10339 );
10340 break;
10341 default:
10342 assert(0); iresult = 0;
10343 }
10344
10345 result = PyLong_FromSsize_t(iresult);
10346
10347 if (kind1 != kind)
10348 PyMem_Free(buf1);
10349 if (kind2 != kind)
10350 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010351
10352 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010353
Guido van Rossumd57fd912000-03-10 22:53:23 +000010354 return result;
10355}
10356
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010357PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010358 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010359\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010360Encode S using the codec registered for encoding. Default encoding\n\
10361is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010362handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010363a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10364'xmlcharrefreplace' as well as any other name registered with\n\
10365codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010366
10367static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010368unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010369{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010370 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010371 char *encoding = NULL;
10372 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010373
Benjamin Peterson308d6372009-09-18 21:42:35 +000010374 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10375 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010376 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010377 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010378}
10379
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010380PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010381 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010382\n\
10383Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010384If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010385
10386static PyObject*
10387unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10388{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010389 Py_ssize_t i, j, line_pos, src_len, incr;
10390 Py_UCS4 ch;
10391 PyObject *u;
10392 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010393 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010394 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010395 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010396
10397 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010398 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010399
Antoine Pitrou22425222011-10-04 19:10:51 +020010400 if (PyUnicode_READY(self) == -1)
10401 return NULL;
10402
Thomas Wouters7e474022000-07-16 12:04:32 +000010403 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010404 src_len = PyUnicode_GET_LENGTH(self);
10405 i = j = line_pos = 0;
10406 kind = PyUnicode_KIND(self);
10407 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010408 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010409 for (; i < src_len; i++) {
10410 ch = PyUnicode_READ(kind, src_data, i);
10411 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010412 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010413 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010414 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010415 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010416 goto overflow;
10417 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010418 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010419 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010420 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010421 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010422 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010423 goto overflow;
10424 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010425 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010426 if (ch == '\n' || ch == '\r')
10427 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010428 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010429 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020010430 if (!found && PyUnicode_CheckExact(self)) {
10431 Py_INCREF((PyObject *) self);
10432 return (PyObject *) self;
10433 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010434
Guido van Rossumd57fd912000-03-10 22:53:23 +000010435 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010436 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010437 if (!u)
10438 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010439 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010440
Antoine Pitroue71d5742011-10-04 15:55:09 +020010441 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010442
Antoine Pitroue71d5742011-10-04 15:55:09 +020010443 for (; i < src_len; i++) {
10444 ch = PyUnicode_READ(kind, src_data, i);
10445 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010446 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010447 incr = tabsize - (line_pos % tabsize);
10448 line_pos += incr;
10449 while (incr--) {
10450 PyUnicode_WRITE(kind, dest_data, j, ' ');
10451 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010452 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010453 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010454 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010455 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010456 line_pos++;
10457 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010458 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010459 if (ch == '\n' || ch == '\r')
10460 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010461 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010462 }
10463 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinner17efeed2011-10-04 20:05:46 +020010464#ifndef DONT_MAKE_RESULT_READY
10465 if (_PyUnicode_READY_REPLACE(&u)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010466 Py_DECREF(u);
10467 return NULL;
10468 }
Victor Stinner17efeed2011-10-04 20:05:46 +020010469#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010470 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010471 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010472
Antoine Pitroue71d5742011-10-04 15:55:09 +020010473 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010474 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10475 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010476}
10477
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010478PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010479 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010480\n\
10481Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010482such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010483arguments start and end are interpreted as in slice notation.\n\
10484\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010485Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010486
10487static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010488unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010489{
Jesus Ceaac451502011-04-20 17:09:23 +020010490 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010491 Py_ssize_t start;
10492 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010493 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010494
Jesus Ceaac451502011-04-20 17:09:23 +020010495 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10496 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010497 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010498
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010499 if (PyUnicode_READY(self) == -1)
10500 return NULL;
10501 if (PyUnicode_READY(substring) == -1)
10502 return NULL;
10503
10504 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +020010505 asciilib_find_slice, ucs1lib_find_slice,
10506 ucs2lib_find_slice, ucs4lib_find_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010507 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010508 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010509
10510 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010511
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010512 if (result == -2)
10513 return NULL;
10514
Christian Heimes217cfd12007-12-02 14:31:20 +000010515 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010516}
10517
10518static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010519unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010520{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010521 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10522 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010523 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010524 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010525}
10526
Guido van Rossumc2504932007-09-18 19:42:40 +000010527/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010528 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010529static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000010530unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010531{
Guido van Rossumc2504932007-09-18 19:42:40 +000010532 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010533 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010534
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010535 if (_PyUnicode_HASH(self) != -1)
10536 return _PyUnicode_HASH(self);
10537 if (PyUnicode_READY(self) == -1)
10538 return -1;
10539 len = PyUnicode_GET_LENGTH(self);
10540
10541 /* The hash function as a macro, gets expanded three times below. */
10542#define HASH(P) \
10543 x = (Py_uhash_t)*P << 7; \
10544 while (--len >= 0) \
10545 x = (1000003*x) ^ (Py_uhash_t)*P++;
10546
10547 switch (PyUnicode_KIND(self)) {
10548 case PyUnicode_1BYTE_KIND: {
10549 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10550 HASH(c);
10551 break;
10552 }
10553 case PyUnicode_2BYTE_KIND: {
10554 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10555 HASH(s);
10556 break;
10557 }
10558 default: {
10559 Py_UCS4 *l;
10560 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10561 "Impossible switch case in unicode_hash");
10562 l = PyUnicode_4BYTE_DATA(self);
10563 HASH(l);
10564 break;
10565 }
10566 }
10567 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10568
Guido van Rossumc2504932007-09-18 19:42:40 +000010569 if (x == -1)
10570 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010571 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010572 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010573}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010574#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010575
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010576PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010577 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010578\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010579Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010580
10581static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010582unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010583{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010584 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020010585 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010586 Py_ssize_t start;
10587 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010588
Jesus Ceaac451502011-04-20 17:09:23 +020010589 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10590 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010591 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010592
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010593 if (PyUnicode_READY(self) == -1)
10594 return NULL;
10595 if (PyUnicode_READY(substring) == -1)
10596 return NULL;
10597
10598 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +020010599 asciilib_find_slice, ucs1lib_find_slice,
10600 ucs2lib_find_slice, ucs4lib_find_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010601 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010602 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010603
10604 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010605
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010606 if (result == -2)
10607 return NULL;
10608
Guido van Rossumd57fd912000-03-10 22:53:23 +000010609 if (result < 0) {
10610 PyErr_SetString(PyExc_ValueError, "substring not found");
10611 return NULL;
10612 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010613
Christian Heimes217cfd12007-12-02 14:31:20 +000010614 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010615}
10616
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010617PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010618 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010619\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010620Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010621at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010622
10623static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010624unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010625{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010626 Py_ssize_t i, length;
10627 int kind;
10628 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010629 int cased;
10630
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010631 if (PyUnicode_READY(self) == -1)
10632 return NULL;
10633 length = PyUnicode_GET_LENGTH(self);
10634 kind = PyUnicode_KIND(self);
10635 data = PyUnicode_DATA(self);
10636
Guido van Rossumd57fd912000-03-10 22:53:23 +000010637 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010638 if (length == 1)
10639 return PyBool_FromLong(
10640 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010641
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010642 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010643 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010644 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010645
Guido van Rossumd57fd912000-03-10 22:53:23 +000010646 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010647 for (i = 0; i < length; i++) {
10648 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010649
Benjamin Peterson29060642009-01-31 22:14:21 +000010650 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10651 return PyBool_FromLong(0);
10652 else if (!cased && Py_UNICODE_ISLOWER(ch))
10653 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010654 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010655 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010656}
10657
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010658PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010659 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010660\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010661Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010662at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010663
10664static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010665unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010666{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010667 Py_ssize_t i, length;
10668 int kind;
10669 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010670 int cased;
10671
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010672 if (PyUnicode_READY(self) == -1)
10673 return NULL;
10674 length = PyUnicode_GET_LENGTH(self);
10675 kind = PyUnicode_KIND(self);
10676 data = PyUnicode_DATA(self);
10677
Guido van Rossumd57fd912000-03-10 22:53:23 +000010678 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010679 if (length == 1)
10680 return PyBool_FromLong(
10681 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010682
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010683 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010684 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010685 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010686
Guido van Rossumd57fd912000-03-10 22:53:23 +000010687 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010688 for (i = 0; i < length; i++) {
10689 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010690
Benjamin Peterson29060642009-01-31 22:14:21 +000010691 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10692 return PyBool_FromLong(0);
10693 else if (!cased && Py_UNICODE_ISUPPER(ch))
10694 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010695 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010696 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010697}
10698
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010699PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010700 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010701\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010702Return True if S is a titlecased string and there is at least one\n\
10703character in S, i.e. upper- and titlecase characters may only\n\
10704follow uncased characters and lowercase characters only cased ones.\n\
10705Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010706
10707static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010708unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010709{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010710 Py_ssize_t i, length;
10711 int kind;
10712 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010713 int cased, previous_is_cased;
10714
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010715 if (PyUnicode_READY(self) == -1)
10716 return NULL;
10717 length = PyUnicode_GET_LENGTH(self);
10718 kind = PyUnicode_KIND(self);
10719 data = PyUnicode_DATA(self);
10720
Guido van Rossumd57fd912000-03-10 22:53:23 +000010721 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010722 if (length == 1) {
10723 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10724 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10725 (Py_UNICODE_ISUPPER(ch) != 0));
10726 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010727
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010728 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010729 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010730 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010731
Guido van Rossumd57fd912000-03-10 22:53:23 +000010732 cased = 0;
10733 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010734 for (i = 0; i < length; i++) {
10735 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010736
Benjamin Peterson29060642009-01-31 22:14:21 +000010737 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10738 if (previous_is_cased)
10739 return PyBool_FromLong(0);
10740 previous_is_cased = 1;
10741 cased = 1;
10742 }
10743 else if (Py_UNICODE_ISLOWER(ch)) {
10744 if (!previous_is_cased)
10745 return PyBool_FromLong(0);
10746 previous_is_cased = 1;
10747 cased = 1;
10748 }
10749 else
10750 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010751 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010752 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010753}
10754
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010755PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010756 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010757\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010758Return True if all characters in S are whitespace\n\
10759and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010760
10761static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010762unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010763{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010764 Py_ssize_t i, length;
10765 int kind;
10766 void *data;
10767
10768 if (PyUnicode_READY(self) == -1)
10769 return NULL;
10770 length = PyUnicode_GET_LENGTH(self);
10771 kind = PyUnicode_KIND(self);
10772 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010773
Guido van Rossumd57fd912000-03-10 22:53:23 +000010774 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010775 if (length == 1)
10776 return PyBool_FromLong(
10777 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010778
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010779 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010780 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010781 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010782
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010783 for (i = 0; i < length; i++) {
10784 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010785 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010786 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010787 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010788 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010789}
10790
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010791PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010792 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010793\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010794Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010795and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010796
10797static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010798unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010799{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010800 Py_ssize_t i, length;
10801 int kind;
10802 void *data;
10803
10804 if (PyUnicode_READY(self) == -1)
10805 return NULL;
10806 length = PyUnicode_GET_LENGTH(self);
10807 kind = PyUnicode_KIND(self);
10808 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010809
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010810 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010811 if (length == 1)
10812 return PyBool_FromLong(
10813 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010814
10815 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010816 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010817 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010818
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010819 for (i = 0; i < length; i++) {
10820 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010821 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010822 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010823 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010824}
10825
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010826PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010827 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010828\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010829Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010830and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010831
10832static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010833unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010834{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010835 int kind;
10836 void *data;
10837 Py_ssize_t len, i;
10838
10839 if (PyUnicode_READY(self) == -1)
10840 return NULL;
10841
10842 kind = PyUnicode_KIND(self);
10843 data = PyUnicode_DATA(self);
10844 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010845
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010846 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010847 if (len == 1) {
10848 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10849 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10850 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010851
10852 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010853 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010854 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010855
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010856 for (i = 0; i < len; i++) {
10857 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010858 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010859 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010860 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010861 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010862}
10863
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010864PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010865 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010866\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010867Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010868False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010869
10870static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010871unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010872{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010873 Py_ssize_t i, length;
10874 int kind;
10875 void *data;
10876
10877 if (PyUnicode_READY(self) == -1)
10878 return NULL;
10879 length = PyUnicode_GET_LENGTH(self);
10880 kind = PyUnicode_KIND(self);
10881 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010882
Guido van Rossumd57fd912000-03-10 22:53:23 +000010883 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010884 if (length == 1)
10885 return PyBool_FromLong(
10886 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010887
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010888 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010889 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010890 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010891
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010892 for (i = 0; i < length; i++) {
10893 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010894 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010895 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010896 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010897}
10898
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010899PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010900 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010901\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010902Return True if all characters in S are digits\n\
10903and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010904
10905static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010906unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010907{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010908 Py_ssize_t i, length;
10909 int kind;
10910 void *data;
10911
10912 if (PyUnicode_READY(self) == -1)
10913 return NULL;
10914 length = PyUnicode_GET_LENGTH(self);
10915 kind = PyUnicode_KIND(self);
10916 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010917
Guido van Rossumd57fd912000-03-10 22:53:23 +000010918 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010919 if (length == 1) {
10920 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10921 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10922 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010923
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010924 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010925 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010926 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010927
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010928 for (i = 0; i < length; i++) {
10929 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010930 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010931 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010932 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010933}
10934
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010935PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010936 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010937\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010938Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010939False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010940
10941static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010942unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010943{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010944 Py_ssize_t i, length;
10945 int kind;
10946 void *data;
10947
10948 if (PyUnicode_READY(self) == -1)
10949 return NULL;
10950 length = PyUnicode_GET_LENGTH(self);
10951 kind = PyUnicode_KIND(self);
10952 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010953
Guido van Rossumd57fd912000-03-10 22:53:23 +000010954 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010955 if (length == 1)
10956 return PyBool_FromLong(
10957 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010958
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010959 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010960 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010961 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010962
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010963 for (i = 0; i < length; i++) {
10964 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010965 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010966 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010967 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010968}
10969
Martin v. Löwis47383402007-08-15 07:32:56 +000010970int
10971PyUnicode_IsIdentifier(PyObject *self)
10972{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010973 int kind;
10974 void *data;
10975 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010976 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010977
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010978 if (PyUnicode_READY(self) == -1) {
10979 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010980 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010981 }
10982
10983 /* Special case for empty strings */
10984 if (PyUnicode_GET_LENGTH(self) == 0)
10985 return 0;
10986 kind = PyUnicode_KIND(self);
10987 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010988
10989 /* PEP 3131 says that the first character must be in
10990 XID_Start and subsequent characters in XID_Continue,
10991 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010992 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010993 letters, digits, underscore). However, given the current
10994 definition of XID_Start and XID_Continue, it is sufficient
10995 to check just for these, except that _ must be allowed
10996 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010997 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010998 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010999 return 0;
11000
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011001 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011002 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011003 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011004 return 1;
11005}
11006
11007PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011008 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011009\n\
11010Return True if S is a valid identifier according\n\
11011to the language definition.");
11012
11013static PyObject*
11014unicode_isidentifier(PyObject *self)
11015{
11016 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11017}
11018
Georg Brandl559e5d72008-06-11 18:37:52 +000011019PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011020 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011021\n\
11022Return True if all characters in S are considered\n\
11023printable in repr() or S is empty, False otherwise.");
11024
11025static PyObject*
11026unicode_isprintable(PyObject *self)
11027{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011028 Py_ssize_t i, length;
11029 int kind;
11030 void *data;
11031
11032 if (PyUnicode_READY(self) == -1)
11033 return NULL;
11034 length = PyUnicode_GET_LENGTH(self);
11035 kind = PyUnicode_KIND(self);
11036 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011037
11038 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011039 if (length == 1)
11040 return PyBool_FromLong(
11041 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011042
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011043 for (i = 0; i < length; i++) {
11044 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011045 Py_RETURN_FALSE;
11046 }
11047 }
11048 Py_RETURN_TRUE;
11049}
11050
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011051PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011052 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011053\n\
11054Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011055iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011056
11057static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011058unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011059{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011060 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011061}
11062
Martin v. Löwis18e16552006-02-15 17:27:45 +000011063static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000011064unicode_length(PyUnicodeObject *self)
11065{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011066 if (PyUnicode_READY(self) == -1)
11067 return -1;
11068 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011069}
11070
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011071PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011072 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011073\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011074Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011075done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011076
11077static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011078unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011079{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011080 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011081 Py_UCS4 fillchar = ' ';
11082
11083 if (PyUnicode_READY(self) == -1)
11084 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011085
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011086 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011087 return NULL;
11088
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011089 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011090 Py_INCREF(self);
11091 return (PyObject*) self;
11092 }
11093
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011094 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011095}
11096
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011097PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011098 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011099\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011100Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011101
11102static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011103unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011104{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011105 return fixup(self, fixlower);
11106}
11107
/* Strip modes shared by the strip/lstrip/rstrip implementations. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the strip modes above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string minus its three-character
   "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
11116
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011117/* externally visible for str.strip(unicode) */
11118PyObject *
11119_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
11120{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011121 void *data;
11122 int kind;
11123 Py_ssize_t i, j, len;
11124 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011125
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011126 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11127 return NULL;
11128
11129 kind = PyUnicode_KIND(self);
11130 data = PyUnicode_DATA(self);
11131 len = PyUnicode_GET_LENGTH(self);
11132 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11133 PyUnicode_DATA(sepobj),
11134 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011135
Benjamin Peterson14339b62009-01-31 16:36:08 +000011136 i = 0;
11137 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011138 while (i < len &&
11139 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011140 i++;
11141 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011142 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011143
Benjamin Peterson14339b62009-01-31 16:36:08 +000011144 j = len;
11145 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011146 do {
11147 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011148 } while (j >= i &&
11149 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011150 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011151 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011152
Victor Stinner12bab6d2011-10-01 01:53:49 +020011153 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011154}
11155
11156PyObject*
11157PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11158{
11159 unsigned char *data;
11160 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011161 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011162
Victor Stinnerde636f32011-10-01 03:55:54 +020011163 if (PyUnicode_READY(self) == -1)
11164 return NULL;
11165
11166 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11167
Victor Stinner12bab6d2011-10-01 01:53:49 +020011168 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011169 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011170 if (PyUnicode_CheckExact(self)) {
11171 Py_INCREF(self);
11172 return self;
11173 }
11174 else
11175 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011176 }
11177
Victor Stinner12bab6d2011-10-01 01:53:49 +020011178 length = end - start;
11179 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011180 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011181
Victor Stinnerde636f32011-10-01 03:55:54 +020011182 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011183 PyErr_SetString(PyExc_IndexError, "string index out of range");
11184 return NULL;
11185 }
11186
Victor Stinnerb9275c12011-10-05 14:01:42 +020011187 if (PyUnicode_IS_ASCII(self)) {
11188 kind = PyUnicode_KIND(self);
11189 data = PyUnicode_1BYTE_DATA(self);
11190 return unicode_fromascii(data + start, length);
11191 }
11192 else {
11193 kind = PyUnicode_KIND(self);
11194 data = PyUnicode_1BYTE_DATA(self);
11195 return PyUnicode_FromKindAndData(kind,
11196 data + PyUnicode_KIND_SIZE(kind, start),
11197 length);
11198 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011199}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011200
11201static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011202do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011203{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011204 int kind;
11205 void *data;
11206 Py_ssize_t len, i, j;
11207
11208 if (PyUnicode_READY(self) == -1)
11209 return NULL;
11210
11211 kind = PyUnicode_KIND(self);
11212 data = PyUnicode_DATA(self);
11213 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011214
Benjamin Peterson14339b62009-01-31 16:36:08 +000011215 i = 0;
11216 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011217 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011218 i++;
11219 }
11220 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011221
Benjamin Peterson14339b62009-01-31 16:36:08 +000011222 j = len;
11223 if (striptype != LEFTSTRIP) {
11224 do {
11225 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011226 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011227 j++;
11228 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011229
Victor Stinner12bab6d2011-10-01 01:53:49 +020011230 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011231}
11232
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011233
11234static PyObject *
11235do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
11236{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011237 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011238
Benjamin Peterson14339b62009-01-31 16:36:08 +000011239 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11240 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011241
Benjamin Peterson14339b62009-01-31 16:36:08 +000011242 if (sep != NULL && sep != Py_None) {
11243 if (PyUnicode_Check(sep))
11244 return _PyUnicode_XStrip(self, striptype, sep);
11245 else {
11246 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011247 "%s arg must be None or str",
11248 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011249 return NULL;
11250 }
11251 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011252
Benjamin Peterson14339b62009-01-31 16:36:08 +000011253 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011254}
11255
11256
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011257PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011258 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011259\n\
11260Return a copy of the string S with leading and trailing\n\
11261whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011262If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011263
11264static PyObject *
11265unicode_strip(PyUnicodeObject *self, PyObject *args)
11266{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011267 if (PyTuple_GET_SIZE(args) == 0)
11268 return do_strip(self, BOTHSTRIP); /* Common case */
11269 else
11270 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011271}
11272
11273
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011274PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011275 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011276\n\
11277Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011278If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011279
11280static PyObject *
11281unicode_lstrip(PyUnicodeObject *self, PyObject *args)
11282{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011283 if (PyTuple_GET_SIZE(args) == 0)
11284 return do_strip(self, LEFTSTRIP); /* Common case */
11285 else
11286 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011287}
11288
11289
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011290PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011291 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011292\n\
11293Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011294If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011295
11296static PyObject *
11297unicode_rstrip(PyUnicodeObject *self, PyObject *args)
11298{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011299 if (PyTuple_GET_SIZE(args) == 0)
11300 return do_strip(self, RIGHTSTRIP); /* Common case */
11301 else
11302 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011303}
11304
11305
Guido van Rossumd57fd912000-03-10 22:53:23 +000011306static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000011307unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011308{
11309 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011310 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011311
Georg Brandl222de0f2009-04-12 12:01:50 +000011312 if (len < 1) {
11313 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011314 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011315 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011316
Tim Peters7a29bd52001-09-12 03:03:31 +000011317 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011318 /* no repeat, return original string */
11319 Py_INCREF(str);
11320 return (PyObject*) str;
11321 }
Tim Peters8f422462000-09-09 06:13:41 +000011322
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011323 if (PyUnicode_READY(str) == -1)
11324 return NULL;
11325
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011326 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011327 PyErr_SetString(PyExc_OverflowError,
11328 "repeated string is too long");
11329 return NULL;
11330 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011331 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011332
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011333 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011334 if (!u)
11335 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011336 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011337
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011338 if (PyUnicode_GET_LENGTH(str) == 1) {
11339 const int kind = PyUnicode_KIND(str);
11340 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11341 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011342 if (kind == PyUnicode_1BYTE_KIND)
11343 memset(to, (unsigned char)fill_char, len);
11344 else {
11345 for (n = 0; n < len; ++n)
11346 PyUnicode_WRITE(kind, to, n, fill_char);
11347 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011348 }
11349 else {
11350 /* number of characters copied this far */
11351 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
11352 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
11353 char *to = (char *) PyUnicode_DATA(u);
11354 Py_MEMCPY(to, PyUnicode_DATA(str),
11355 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011356 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011357 n = (done <= nchars-done) ? done : nchars-done;
11358 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011359 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011360 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011361 }
11362
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011363 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011364 return (PyObject*) u;
11365}
11366
Alexander Belopolsky40018472011-02-26 01:02:56 +000011367PyObject *
11368PyUnicode_Replace(PyObject *obj,
11369 PyObject *subobj,
11370 PyObject *replobj,
11371 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011372{
11373 PyObject *self;
11374 PyObject *str1;
11375 PyObject *str2;
11376 PyObject *result;
11377
11378 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011379 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011380 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011381 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011382 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011383 Py_DECREF(self);
11384 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011385 }
11386 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011387 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011388 Py_DECREF(self);
11389 Py_DECREF(str1);
11390 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011391 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011392 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011393 Py_DECREF(self);
11394 Py_DECREF(str1);
11395 Py_DECREF(str2);
11396 return result;
11397}
11398
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011399PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011400 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011401\n\
11402Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011403old replaced by new. If the optional argument count is\n\
11404given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011405
11406static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011407unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011408{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011409 PyObject *str1;
11410 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011411 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011412 PyObject *result;
11413
Martin v. Löwis18e16552006-02-15 17:27:45 +000011414 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011415 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011416 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011417 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011418 str1 = PyUnicode_FromObject(str1);
11419 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11420 return NULL;
11421 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011422 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011423 Py_DECREF(str1);
11424 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011425 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011426
11427 result = replace(self, str1, str2, maxcount);
11428
11429 Py_DECREF(str1);
11430 Py_DECREF(str2);
11431 return result;
11432}
11433
Alexander Belopolsky40018472011-02-26 01:02:56 +000011434static PyObject *
11435unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011436{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011437 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011438 Py_ssize_t isize;
11439 Py_ssize_t osize, squote, dquote, i, o;
11440 Py_UCS4 max, quote;
11441 int ikind, okind;
11442 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011443
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011444 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011445 return NULL;
11446
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011447 isize = PyUnicode_GET_LENGTH(unicode);
11448 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011449
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011450 /* Compute length of output, quote characters, and
11451 maximum character */
11452 osize = 2; /* quotes */
11453 max = 127;
11454 squote = dquote = 0;
11455 ikind = PyUnicode_KIND(unicode);
11456 for (i = 0; i < isize; i++) {
11457 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11458 switch (ch) {
11459 case '\'': squote++; osize++; break;
11460 case '"': dquote++; osize++; break;
11461 case '\\': case '\t': case '\r': case '\n':
11462 osize += 2; break;
11463 default:
11464 /* Fast-path ASCII */
11465 if (ch < ' ' || ch == 0x7f)
11466 osize += 4; /* \xHH */
11467 else if (ch < 0x7f)
11468 osize++;
11469 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11470 osize++;
11471 max = ch > max ? ch : max;
11472 }
11473 else if (ch < 0x100)
11474 osize += 4; /* \xHH */
11475 else if (ch < 0x10000)
11476 osize += 6; /* \uHHHH */
11477 else
11478 osize += 10; /* \uHHHHHHHH */
11479 }
11480 }
11481
11482 quote = '\'';
11483 if (squote) {
11484 if (dquote)
11485 /* Both squote and dquote present. Use squote,
11486 and escape them */
11487 osize += squote;
11488 else
11489 quote = '"';
11490 }
11491
11492 repr = PyUnicode_New(osize, max);
11493 if (repr == NULL)
11494 return NULL;
11495 okind = PyUnicode_KIND(repr);
11496 odata = PyUnicode_DATA(repr);
11497
11498 PyUnicode_WRITE(okind, odata, 0, quote);
11499 PyUnicode_WRITE(okind, odata, osize-1, quote);
11500
11501 for (i = 0, o = 1; i < isize; i++) {
11502 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011503
11504 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011505 if ((ch == quote) || (ch == '\\')) {
11506 PyUnicode_WRITE(okind, odata, o++, '\\');
11507 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011508 continue;
11509 }
11510
Benjamin Peterson29060642009-01-31 22:14:21 +000011511 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011512 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011513 PyUnicode_WRITE(okind, odata, o++, '\\');
11514 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011515 }
11516 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011517 PyUnicode_WRITE(okind, odata, o++, '\\');
11518 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011519 }
11520 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011521 PyUnicode_WRITE(okind, odata, o++, '\\');
11522 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011523 }
11524
11525 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011526 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011527 PyUnicode_WRITE(okind, odata, o++, '\\');
11528 PyUnicode_WRITE(okind, odata, o++, 'x');
11529 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11530 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011531 }
11532
Georg Brandl559e5d72008-06-11 18:37:52 +000011533 /* Copy ASCII characters as-is */
11534 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011535 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011536 }
11537
Benjamin Peterson29060642009-01-31 22:14:21 +000011538 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011539 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011540 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011541 (categories Z* and C* except ASCII space)
11542 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011543 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011544 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011545 if (ch <= 0xff) {
11546 PyUnicode_WRITE(okind, odata, o++, '\\');
11547 PyUnicode_WRITE(okind, odata, o++, 'x');
11548 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11549 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011550 }
11551 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011552 else if (ch >= 0x10000) {
11553 PyUnicode_WRITE(okind, odata, o++, '\\');
11554 PyUnicode_WRITE(okind, odata, o++, 'U');
11555 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
11556 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
11557 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
11558 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
11559 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11560 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11561 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11562 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011563 }
11564 /* Map 16-bit characters to '\uxxxx' */
11565 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011566 PyUnicode_WRITE(okind, odata, o++, '\\');
11567 PyUnicode_WRITE(okind, odata, o++, 'u');
11568 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11569 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11570 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11571 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011572 }
11573 }
11574 /* Copy characters as-is */
11575 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011576 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011577 }
11578 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011579 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011580 /* Closing quote already added at the beginning */
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011581 assert(_PyUnicode_CheckConsistency(unicode, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000011582 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011583}
11584
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011585PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011586 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011587\n\
11588Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011589such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011590arguments start and end are interpreted as in slice notation.\n\
11591\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011592Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011593
11594static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011595unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011596{
Jesus Ceaac451502011-04-20 17:09:23 +020011597 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011598 Py_ssize_t start;
11599 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011600 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011601
Jesus Ceaac451502011-04-20 17:09:23 +020011602 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11603 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011604 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011605
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011606 if (PyUnicode_READY(self) == -1)
11607 return NULL;
11608 if (PyUnicode_READY(substring) == -1)
11609 return NULL;
11610
11611 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +020011612 asciilib_rfind_slice, ucs1lib_rfind_slice,
11613 ucs2lib_rfind_slice, ucs4lib_rfind_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011614 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011615 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011616
11617 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011618
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011619 if (result == -2)
11620 return NULL;
11621
Christian Heimes217cfd12007-12-02 14:31:20 +000011622 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011623}
11624
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011625PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011626 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011627\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011628Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011629
11630static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011631unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011632{
Jesus Ceaac451502011-04-20 17:09:23 +020011633 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011634 Py_ssize_t start;
11635 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011636 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011637
Jesus Ceaac451502011-04-20 17:09:23 +020011638 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11639 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011640 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011641
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011642 if (PyUnicode_READY(self) == -1)
11643 return NULL;
11644 if (PyUnicode_READY(substring) == -1)
11645 return NULL;
11646
11647 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +020011648 asciilib_rfind_slice, ucs1lib_rfind_slice,
11649 ucs2lib_rfind_slice, ucs4lib_rfind_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011650 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011651 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011652
11653 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011654
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011655 if (result == -2)
11656 return NULL;
11657
Guido van Rossumd57fd912000-03-10 22:53:23 +000011658 if (result < 0) {
11659 PyErr_SetString(PyExc_ValueError, "substring not found");
11660 return NULL;
11661 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011662
Christian Heimes217cfd12007-12-02 14:31:20 +000011663 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011664}
11665
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011666PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011667 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011668\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011669Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011670done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011671
11672static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011673unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011674{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011675 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011676 Py_UCS4 fillchar = ' ';
11677
Victor Stinnere9a29352011-10-01 02:14:59 +020011678 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011679 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011680
Victor Stinnere9a29352011-10-01 02:14:59 +020011681 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011682 return NULL;
11683
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011684 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011685 Py_INCREF(self);
11686 return (PyObject*) self;
11687 }
11688
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011689 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011690}
11691
Alexander Belopolsky40018472011-02-26 01:02:56 +000011692PyObject *
11693PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011694{
11695 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011696
Guido van Rossumd57fd912000-03-10 22:53:23 +000011697 s = PyUnicode_FromObject(s);
11698 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011699 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011700 if (sep != NULL) {
11701 sep = PyUnicode_FromObject(sep);
11702 if (sep == NULL) {
11703 Py_DECREF(s);
11704 return NULL;
11705 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011706 }
11707
Victor Stinner9310abb2011-10-05 00:59:23 +020011708 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011709
11710 Py_DECREF(s);
11711 Py_XDECREF(sep);
11712 return result;
11713}
11714
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011715PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011716 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011717\n\
11718Return a list of the words in S, using sep as the\n\
11719delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011720splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011721whitespace string is a separator and empty strings are\n\
11722removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011723
11724static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011725unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011726{
11727 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011728 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011729
Martin v. Löwis18e16552006-02-15 17:27:45 +000011730 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011731 return NULL;
11732
11733 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011734 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011735 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020011736 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011737 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011738 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011739}
11740
Thomas Wouters477c8d52006-05-27 19:21:47 +000011741PyObject *
11742PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11743{
11744 PyObject* str_obj;
11745 PyObject* sep_obj;
11746 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011747 int kind1, kind2, kind;
11748 void *buf1 = NULL, *buf2 = NULL;
11749 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011750
11751 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011752 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011753 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011754 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011755 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011756 Py_DECREF(str_obj);
11757 return NULL;
11758 }
11759
Victor Stinner14f8f022011-10-05 20:58:25 +020011760 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011761 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020011762 kind = Py_MAX(kind1, kind2);
11763 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011764 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020011765 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011766 if (!buf1)
11767 goto onError;
11768 buf2 = PyUnicode_DATA(sep_obj);
11769 if (kind2 != kind)
11770 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11771 if (!buf2)
11772 goto onError;
11773 len1 = PyUnicode_GET_LENGTH(str_obj);
11774 len2 = PyUnicode_GET_LENGTH(sep_obj);
11775
Victor Stinner14f8f022011-10-05 20:58:25 +020011776 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011777 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020011778 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
11779 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11780 else
11781 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011782 break;
11783 case PyUnicode_2BYTE_KIND:
11784 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11785 break;
11786 case PyUnicode_4BYTE_KIND:
11787 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11788 break;
11789 default:
11790 assert(0);
11791 out = 0;
11792 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011793
11794 Py_DECREF(sep_obj);
11795 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011796 if (kind1 != kind)
11797 PyMem_Free(buf1);
11798 if (kind2 != kind)
11799 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011800
11801 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011802 onError:
11803 Py_DECREF(sep_obj);
11804 Py_DECREF(str_obj);
11805 if (kind1 != kind && buf1)
11806 PyMem_Free(buf1);
11807 if (kind2 != kind && buf2)
11808 PyMem_Free(buf2);
11809 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011810}
11811
11812
11813PyObject *
11814PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11815{
11816 PyObject* str_obj;
11817 PyObject* sep_obj;
11818 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011819 int kind1, kind2, kind;
11820 void *buf1 = NULL, *buf2 = NULL;
11821 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011822
11823 str_obj = PyUnicode_FromObject(str_in);
11824 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011825 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011826 sep_obj = PyUnicode_FromObject(sep_in);
11827 if (!sep_obj) {
11828 Py_DECREF(str_obj);
11829 return NULL;
11830 }
11831
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011832 kind1 = PyUnicode_KIND(str_in);
11833 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011834 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011835 buf1 = PyUnicode_DATA(str_in);
11836 if (kind1 != kind)
11837 buf1 = _PyUnicode_AsKind(str_in, kind);
11838 if (!buf1)
11839 goto onError;
11840 buf2 = PyUnicode_DATA(sep_obj);
11841 if (kind2 != kind)
11842 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11843 if (!buf2)
11844 goto onError;
11845 len1 = PyUnicode_GET_LENGTH(str_obj);
11846 len2 = PyUnicode_GET_LENGTH(sep_obj);
11847
11848 switch(PyUnicode_KIND(str_in)) {
11849 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020011850 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
11851 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11852 else
11853 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011854 break;
11855 case PyUnicode_2BYTE_KIND:
11856 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11857 break;
11858 case PyUnicode_4BYTE_KIND:
11859 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11860 break;
11861 default:
11862 assert(0);
11863 out = 0;
11864 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011865
11866 Py_DECREF(sep_obj);
11867 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011868 if (kind1 != kind)
11869 PyMem_Free(buf1);
11870 if (kind2 != kind)
11871 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011872
11873 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011874 onError:
11875 Py_DECREF(sep_obj);
11876 Py_DECREF(str_obj);
11877 if (kind1 != kind && buf1)
11878 PyMem_Free(buf1);
11879 if (kind2 != kind && buf2)
11880 PyMem_Free(buf2);
11881 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011882}
11883
11884PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011885 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011886\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011887Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011888the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011889found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011890
11891static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011892unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011893{
Victor Stinner9310abb2011-10-05 00:59:23 +020011894 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011895}
11896
11897PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011898 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011899\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011900Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011901the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011902separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011903
11904static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011905unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011906{
Victor Stinner9310abb2011-10-05 00:59:23 +020011907 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011908}
11909
Alexander Belopolsky40018472011-02-26 01:02:56 +000011910PyObject *
11911PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011912{
11913 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011914
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011915 s = PyUnicode_FromObject(s);
11916 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011917 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011918 if (sep != NULL) {
11919 sep = PyUnicode_FromObject(sep);
11920 if (sep == NULL) {
11921 Py_DECREF(s);
11922 return NULL;
11923 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011924 }
11925
Victor Stinner9310abb2011-10-05 00:59:23 +020011926 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011927
11928 Py_DECREF(s);
11929 Py_XDECREF(sep);
11930 return result;
11931}
11932
11933PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011934 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011935\n\
11936Return a list of the words in S, using sep as the\n\
11937delimiter string, starting at the end of the string and\n\
11938working to the front. If maxsplit is given, at most maxsplit\n\
11939splits are done. If sep is not specified, any whitespace string\n\
11940is a separator.");
11941
11942static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011943unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011944{
11945 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011946 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011947
Martin v. Löwis18e16552006-02-15 17:27:45 +000011948 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011949 return NULL;
11950
11951 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011952 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011953 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020011954 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011955 else
Victor Stinner9310abb2011-10-05 00:59:23 +020011956 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011957}
11958
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011959PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011960 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011961\n\
11962Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011963Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011964is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011965
11966static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011967unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011968{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011969 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011970 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011971
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011972 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11973 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011974 return NULL;
11975
Guido van Rossum86662912000-04-11 15:38:46 +000011976 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011977}
11978
11979static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011980PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011981{
Walter Dörwald346737f2007-05-31 10:44:43 +000011982 if (PyUnicode_CheckExact(self)) {
11983 Py_INCREF(self);
11984 return self;
11985 } else
11986 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020011987 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011988}
11989
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011990PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011991 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011992\n\
11993Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011994and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011995
11996static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011997unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011998{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011999 return fixup(self, fixswapcase);
12000}
12001
Georg Brandlceee0772007-11-27 23:48:05 +000012002PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012003 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012004\n\
12005Return a translation table usable for str.translate().\n\
12006If there is only one argument, it must be a dictionary mapping Unicode\n\
12007ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012008Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012009If there are two arguments, they must be strings of equal length, and\n\
12010in the resulting dictionary, each character in x will be mapped to the\n\
12011character at the same position in y. If there is a third argument, it\n\
12012must be a string, whose characters will be mapped to None in the result.");
12013
12014static PyObject*
12015unicode_maketrans(PyUnicodeObject *null, PyObject *args)
12016{
12017 PyObject *x, *y = NULL, *z = NULL;
12018 PyObject *new = NULL, *key, *value;
12019 Py_ssize_t i = 0;
12020 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012021
Georg Brandlceee0772007-11-27 23:48:05 +000012022 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12023 return NULL;
12024 new = PyDict_New();
12025 if (!new)
12026 return NULL;
12027 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012028 int x_kind, y_kind, z_kind;
12029 void *x_data, *y_data, *z_data;
12030
Georg Brandlceee0772007-11-27 23:48:05 +000012031 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012032 if (!PyUnicode_Check(x)) {
12033 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12034 "be a string if there is a second argument");
12035 goto err;
12036 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012037 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012038 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12039 "arguments must have equal length");
12040 goto err;
12041 }
12042 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012043 x_kind = PyUnicode_KIND(x);
12044 y_kind = PyUnicode_KIND(y);
12045 x_data = PyUnicode_DATA(x);
12046 y_data = PyUnicode_DATA(y);
12047 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12048 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12049 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012050 if (!key || !value)
12051 goto err;
12052 res = PyDict_SetItem(new, key, value);
12053 Py_DECREF(key);
12054 Py_DECREF(value);
12055 if (res < 0)
12056 goto err;
12057 }
12058 /* create entries for deleting chars in z */
12059 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012060 z_kind = PyUnicode_KIND(z);
12061 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000012062 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012063 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012064 if (!key)
12065 goto err;
12066 res = PyDict_SetItem(new, key, Py_None);
12067 Py_DECREF(key);
12068 if (res < 0)
12069 goto err;
12070 }
12071 }
12072 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012073 int kind;
12074 void *data;
12075
Georg Brandlceee0772007-11-27 23:48:05 +000012076 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012077 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012078 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12079 "to maketrans it must be a dict");
12080 goto err;
12081 }
12082 /* copy entries into the new dict, converting string keys to int keys */
12083 while (PyDict_Next(x, &i, &key, &value)) {
12084 if (PyUnicode_Check(key)) {
12085 /* convert string keys to integer keys */
12086 PyObject *newkey;
12087 if (PyUnicode_GET_SIZE(key) != 1) {
12088 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12089 "table must be of length 1");
12090 goto err;
12091 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012092 kind = PyUnicode_KIND(key);
12093 data = PyUnicode_DATA(key);
12094 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012095 if (!newkey)
12096 goto err;
12097 res = PyDict_SetItem(new, newkey, value);
12098 Py_DECREF(newkey);
12099 if (res < 0)
12100 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012101 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012102 /* just keep integer keys */
12103 if (PyDict_SetItem(new, key, value) < 0)
12104 goto err;
12105 } else {
12106 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12107 "be strings or integers");
12108 goto err;
12109 }
12110 }
12111 }
12112 return new;
12113 err:
12114 Py_DECREF(new);
12115 return NULL;
12116}
12117
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012118PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012119 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012120\n\
12121Return a copy of the string S, where all characters have been mapped\n\
12122through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012123Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012124Unmapped characters are left untouched. Characters mapped to None\n\
12125are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012126
12127static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012128unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012129{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012130 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012131}
12132
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012133PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012134 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012135\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012136Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012137
12138static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012139unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012140{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012141 return fixup(self, fixupper);
12142}
12143
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012144PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012145 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012146\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012147Pad a numeric string S with zeros on the left, to fill a field\n\
12148of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012149
12150static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012151unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012152{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012153 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012154 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012155 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012156 int kind;
12157 void *data;
12158 Py_UCS4 chr;
12159
12160 if (PyUnicode_READY(self) == -1)
12161 return NULL;
12162
Martin v. Löwis18e16552006-02-15 17:27:45 +000012163 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012164 return NULL;
12165
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012166 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012167 if (PyUnicode_CheckExact(self)) {
12168 Py_INCREF(self);
12169 return (PyObject*) self;
12170 }
12171 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020012172 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012173 }
12174
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012175 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012176
12177 u = pad(self, fill, 0, '0');
12178
Walter Dörwald068325e2002-04-15 13:36:47 +000012179 if (u == NULL)
12180 return NULL;
12181
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012182 kind = PyUnicode_KIND(u);
12183 data = PyUnicode_DATA(u);
12184 chr = PyUnicode_READ(kind, data, fill);
12185
12186 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012187 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012188 PyUnicode_WRITE(kind, data, 0, chr);
12189 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012190 }
12191
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012192 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012193 return (PyObject*) u;
12194}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012195
#if 0
/* Compiled out.  Thin wrapper that forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted;

    converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
12203
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012204PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012205 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012206\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012207Return True if S starts with the specified prefix, False otherwise.\n\
12208With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012209With optional end, stop comparing S at that position.\n\
12210prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012211
12212static PyObject *
12213unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012214 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012215{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012216 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012217 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012218 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012219 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012220 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012221
Jesus Ceaac451502011-04-20 17:09:23 +020012222 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012223 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012224 if (PyTuple_Check(subobj)) {
12225 Py_ssize_t i;
12226 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12227 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012228 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012229 if (substring == NULL)
12230 return NULL;
12231 result = tailmatch(self, substring, start, end, -1);
12232 Py_DECREF(substring);
12233 if (result) {
12234 Py_RETURN_TRUE;
12235 }
12236 }
12237 /* nothing matched */
12238 Py_RETURN_FALSE;
12239 }
12240 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012241 if (substring == NULL) {
12242 if (PyErr_ExceptionMatches(PyExc_TypeError))
12243 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12244 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012245 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012246 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012247 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012248 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012249 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012250}
12251
12252
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012253PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012254 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012255\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012256Return True if S ends with the specified suffix, False otherwise.\n\
12257With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012258With optional end, stop comparing S at that position.\n\
12259suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012260
12261static PyObject *
12262unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012263 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012264{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012265 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012266 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012267 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012268 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012269 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012270
Jesus Ceaac451502011-04-20 17:09:23 +020012271 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012272 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012273 if (PyTuple_Check(subobj)) {
12274 Py_ssize_t i;
12275 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12276 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012277 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012278 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012279 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012280 result = tailmatch(self, substring, start, end, +1);
12281 Py_DECREF(substring);
12282 if (result) {
12283 Py_RETURN_TRUE;
12284 }
12285 }
12286 Py_RETURN_FALSE;
12287 }
12288 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012289 if (substring == NULL) {
12290 if (PyErr_ExceptionMatches(PyExc_TypeError))
12291 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12292 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012293 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012294 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012295 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012296 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012297 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012298}
12299
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012300#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012301
12302PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012303 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012304\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012305Return a formatted version of S, using substitutions from args and kwargs.\n\
12306The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012307
Eric Smith27bbca62010-11-04 17:06:58 +000012308PyDoc_STRVAR(format_map__doc__,
12309 "S.format_map(mapping) -> str\n\
12310\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012311Return a formatted version of S, using substitutions from mapping.\n\
12312The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012313
Eric Smith4a7d76d2008-05-30 18:10:19 +000012314static PyObject *
12315unicode__format__(PyObject* self, PyObject* args)
12316{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012317 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012318
12319 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12320 return NULL;
12321
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012322 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012323 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012324 if (out != NULL)
12325 assert(_PyUnicode_CheckConsistency(out, 1));
12326 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012327}
12328
Eric Smith8c663262007-08-25 02:26:07 +000012329PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012330 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012331\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012332Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012333
12334static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012335unicode__sizeof__(PyUnicodeObject *v)
12336{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012337 Py_ssize_t size;
12338
12339 /* If it's a compact object, account for base structure +
12340 character data. */
12341 if (PyUnicode_IS_COMPACT_ASCII(v))
12342 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12343 else if (PyUnicode_IS_COMPACT(v))
12344 size = sizeof(PyCompactUnicodeObject) +
12345 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
12346 else {
12347 /* If it is a two-block object, account for base object, and
12348 for character block if present. */
12349 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012350 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012351 size += (PyUnicode_GET_LENGTH(v) + 1) *
12352 PyUnicode_CHARACTER_SIZE(v);
12353 }
12354 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012355 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012356 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012357 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012358 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012359 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012360
12361 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012362}
12363
12364PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012365 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012366
12367static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012368unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012369{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012370 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012371 if (!copy)
12372 return NULL;
12373 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012374}
12375
Guido van Rossumd57fd912000-03-10 22:53:23 +000012376static PyMethodDef unicode_methods[] = {
12377
12378 /* Order is according to common usage: often used methods should
12379 appear first, since lookup is done sequentially. */
12380
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012381 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012382 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12383 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012384 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012385 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12386 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12387 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12388 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12389 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12390 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12391 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012392 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012393 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12394 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12395 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012396 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012397 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12398 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12399 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012400 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012401 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012402 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012403 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012404 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12405 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12406 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12407 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12408 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12409 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12410 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12411 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12412 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12413 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12414 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12415 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12416 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12417 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012418 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012419 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012420 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012421 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012422 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012423 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012424 {"maketrans", (PyCFunction) unicode_maketrans,
12425 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012426 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012427#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012428 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012429#endif
12430
12431#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012432 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012433 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012434#endif
12435
Benjamin Peterson14339b62009-01-31 16:36:08 +000012436 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012437 {NULL, NULL}
12438};
12439
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012440static PyObject *
12441unicode_mod(PyObject *v, PyObject *w)
12442{
Brian Curtindfc80e32011-08-10 20:28:54 -050012443 if (!PyUnicode_Check(v))
12444 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012445 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012446}
12447
12448static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012449 0, /*nb_add*/
12450 0, /*nb_subtract*/
12451 0, /*nb_multiply*/
12452 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012453};
12454
Guido van Rossumd57fd912000-03-10 22:53:23 +000012455static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012456 (lenfunc) unicode_length, /* sq_length */
12457 PyUnicode_Concat, /* sq_concat */
12458 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12459 (ssizeargfunc) unicode_getitem, /* sq_item */
12460 0, /* sq_slice */
12461 0, /* sq_ass_item */
12462 0, /* sq_ass_slice */
12463 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012464};
12465
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012466static PyObject*
12467unicode_subscript(PyUnicodeObject* self, PyObject* item)
12468{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012469 if (PyUnicode_READY(self) == -1)
12470 return NULL;
12471
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012472 if (PyIndex_Check(item)) {
12473 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012474 if (i == -1 && PyErr_Occurred())
12475 return NULL;
12476 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012477 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012478 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012479 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012480 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012481 PyObject *result;
12482 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012483 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012484 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012485
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012486 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012487 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012488 return NULL;
12489 }
12490
12491 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012492 return PyUnicode_New(0, 0);
12493 } else if (start == 0 && step == 1 &&
12494 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012495 PyUnicode_CheckExact(self)) {
12496 Py_INCREF(self);
12497 return (PyObject *)self;
12498 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012499 return PyUnicode_Substring((PyObject*)self,
12500 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012501 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012502 /* General case */
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012503 max_char = 0;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012504 src_kind = PyUnicode_KIND(self);
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012505 kind_limit = kind_maxchar_limit(src_kind);
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012506 src_data = PyUnicode_DATA(self);
12507 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12508 ch = PyUnicode_READ(src_kind, src_data, cur);
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012509 if (ch > max_char) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012510 max_char = ch;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012511 if (max_char >= kind_limit)
12512 break;
12513 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012514 }
12515 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012516 if (result == NULL)
12517 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012518 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012519 dest_data = PyUnicode_DATA(result);
12520
12521 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012522 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
12523 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012524 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012525 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012526 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012527 } else {
12528 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12529 return NULL;
12530 }
12531}
12532
12533static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012534 (lenfunc)unicode_length, /* mp_length */
12535 (binaryfunc)unicode_subscript, /* mp_subscript */
12536 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012537};
12538
Guido van Rossumd57fd912000-03-10 22:53:23 +000012539
Guido van Rossumd57fd912000-03-10 22:53:23 +000012540/* Helpers for PyUnicode_Format() */
12541
12542static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012543getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012544{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012545 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012546 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012547 (*p_argidx)++;
12548 if (arglen < 0)
12549 return args;
12550 else
12551 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012552 }
12553 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012554 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012555 return NULL;
12556}
12557
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012558/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012559
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012560static PyObject *
12561formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012562{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012563 char *p;
12564 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012565 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012566
Guido van Rossumd57fd912000-03-10 22:53:23 +000012567 x = PyFloat_AsDouble(v);
12568 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012569 return NULL;
12570
Guido van Rossumd57fd912000-03-10 22:53:23 +000012571 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012572 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012573
Eric Smith0923d1d2009-04-16 20:16:10 +000012574 p = PyOS_double_to_string(x, type, prec,
12575 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012576 if (p == NULL)
12577 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012578 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012579 PyMem_Free(p);
12580 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012581}
12582
Tim Peters38fd5b62000-09-21 05:43:11 +000012583static PyObject*
12584formatlong(PyObject *val, int flags, int prec, int type)
12585{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012586 char *buf;
12587 int len;
12588 PyObject *str; /* temporary string object. */
12589 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012590
Benjamin Peterson14339b62009-01-31 16:36:08 +000012591 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12592 if (!str)
12593 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012594 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012595 Py_DECREF(str);
12596 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012597}
12598
Guido van Rossumd57fd912000-03-10 22:53:23 +000012599static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012600formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012601 size_t buflen,
12602 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012603{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012604 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012605 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012606 if (PyUnicode_GET_LENGTH(v) == 1) {
12607 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012608 buf[1] = '\0';
12609 return 1;
12610 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012611 goto onError;
12612 }
12613 else {
12614 /* Integer input truncated to a character */
12615 long x;
12616 x = PyLong_AsLong(v);
12617 if (x == -1 && PyErr_Occurred())
12618 goto onError;
12619
12620 if (x < 0 || x > 0x10ffff) {
12621 PyErr_SetString(PyExc_OverflowError,
12622 "%c arg not in range(0x110000)");
12623 return -1;
12624 }
12625
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012626 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012627 buf[1] = '\0';
12628 return 1;
12629 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012630
Benjamin Peterson29060642009-01-31 22:14:21 +000012631 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012632 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012633 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012634 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012635}
12636
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
   FORMATBUFLEN is the length of the buffer in which chars are formatted.

   The expansion is wrapped in parentheses so that it always behaves as
   a single operand, e.g. directly after sizeof or next to a postfix
   operator.
*/
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012641
Alexander Belopolsky40018472011-02-26 01:02:56 +000012642PyObject *
12643PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012644{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012645 void *fmt;
12646 int fmtkind;
12647 PyObject *result;
12648 Py_UCS4 *res, *res0;
12649 Py_UCS4 max;
12650 int kind;
12651 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012652 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012653 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012654 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000012655
Guido van Rossumd57fd912000-03-10 22:53:23 +000012656 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012657 PyErr_BadInternalCall();
12658 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012659 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012660 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12661 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012662 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012663 fmt = PyUnicode_DATA(uformat);
12664 fmtkind = PyUnicode_KIND(uformat);
12665 fmtcnt = PyUnicode_GET_LENGTH(uformat);
12666 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012667
12668 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012669 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
12670 if (res0 == NULL) {
12671 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012672 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012673 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012674
12675 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012676 arglen = PyTuple_Size(args);
12677 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012678 }
12679 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012680 arglen = -1;
12681 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012682 }
Christian Heimes90aa7642007-12-19 02:45:37 +000012683 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000012684 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000012685 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012686
12687 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012688 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012689 if (--rescnt < 0) {
12690 rescnt = fmtcnt + 100;
12691 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012692 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12693 if (res0 == NULL){
12694 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012695 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012696 }
12697 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012698 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012699 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012700 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012701 }
12702 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012703 /* Got a format specifier */
12704 int flags = 0;
12705 Py_ssize_t width = -1;
12706 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012707 Py_UCS4 c = '\0';
12708 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000012709 int isnumok;
12710 PyObject *v = NULL;
12711 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012712 void *pbuf;
12713 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000012714 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012715 Py_ssize_t len, len1;
12716 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012717
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012718 fmtpos++;
12719 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12720 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012721 Py_ssize_t keylen;
12722 PyObject *key;
12723 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012724
Benjamin Peterson29060642009-01-31 22:14:21 +000012725 if (dict == NULL) {
12726 PyErr_SetString(PyExc_TypeError,
12727 "format requires a mapping");
12728 goto onError;
12729 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012730 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012731 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012732 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012733 /* Skip over balanced parentheses */
12734 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012735 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012736 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012737 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012738 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012739 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012740 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012741 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012742 if (fmtcnt < 0 || pcount > 0) {
12743 PyErr_SetString(PyExc_ValueError,
12744 "incomplete format key");
12745 goto onError;
12746 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012747 key = PyUnicode_Substring((PyObject*)uformat,
12748 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012749 if (key == NULL)
12750 goto onError;
12751 if (args_owned) {
12752 Py_DECREF(args);
12753 args_owned = 0;
12754 }
12755 args = PyObject_GetItem(dict, key);
12756 Py_DECREF(key);
12757 if (args == NULL) {
12758 goto onError;
12759 }
12760 args_owned = 1;
12761 arglen = -1;
12762 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012763 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012764 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012765 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012766 case '-': flags |= F_LJUST; continue;
12767 case '+': flags |= F_SIGN; continue;
12768 case ' ': flags |= F_BLANK; continue;
12769 case '#': flags |= F_ALT; continue;
12770 case '0': flags |= F_ZERO; continue;
12771 }
12772 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012773 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012774 if (c == '*') {
12775 v = getnextarg(args, arglen, &argidx);
12776 if (v == NULL)
12777 goto onError;
12778 if (!PyLong_Check(v)) {
12779 PyErr_SetString(PyExc_TypeError,
12780 "* wants int");
12781 goto onError;
12782 }
12783 width = PyLong_AsLong(v);
12784 if (width == -1 && PyErr_Occurred())
12785 goto onError;
12786 if (width < 0) {
12787 flags |= F_LJUST;
12788 width = -width;
12789 }
12790 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012791 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012792 }
12793 else if (c >= '0' && c <= '9') {
12794 width = c - '0';
12795 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012796 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012797 if (c < '0' || c > '9')
12798 break;
12799 if ((width*10) / 10 != width) {
12800 PyErr_SetString(PyExc_ValueError,
12801 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012802 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012803 }
12804 width = width*10 + (c - '0');
12805 }
12806 }
12807 if (c == '.') {
12808 prec = 0;
12809 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012810 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012811 if (c == '*') {
12812 v = getnextarg(args, arglen, &argidx);
12813 if (v == NULL)
12814 goto onError;
12815 if (!PyLong_Check(v)) {
12816 PyErr_SetString(PyExc_TypeError,
12817 "* wants int");
12818 goto onError;
12819 }
12820 prec = PyLong_AsLong(v);
12821 if (prec == -1 && PyErr_Occurred())
12822 goto onError;
12823 if (prec < 0)
12824 prec = 0;
12825 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012826 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012827 }
12828 else if (c >= '0' && c <= '9') {
12829 prec = c - '0';
12830 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012831 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012832 if (c < '0' || c > '9')
12833 break;
12834 if ((prec*10) / 10 != prec) {
12835 PyErr_SetString(PyExc_ValueError,
12836 "prec too big");
12837 goto onError;
12838 }
12839 prec = prec*10 + (c - '0');
12840 }
12841 }
12842 } /* prec */
12843 if (fmtcnt >= 0) {
12844 if (c == 'h' || c == 'l' || c == 'L') {
12845 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012846 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012847 }
12848 }
12849 if (fmtcnt < 0) {
12850 PyErr_SetString(PyExc_ValueError,
12851 "incomplete format");
12852 goto onError;
12853 }
12854 if (c != '%') {
12855 v = getnextarg(args, arglen, &argidx);
12856 if (v == NULL)
12857 goto onError;
12858 }
12859 sign = 0;
12860 fill = ' ';
12861 switch (c) {
12862
12863 case '%':
12864 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012865 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012866 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012867 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012868 len = 1;
12869 break;
12870
12871 case 's':
12872 case 'r':
12873 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012874 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012875 temp = v;
12876 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012877 }
12878 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012879 if (c == 's')
12880 temp = PyObject_Str(v);
12881 else if (c == 'r')
12882 temp = PyObject_Repr(v);
12883 else
12884 temp = PyObject_ASCII(v);
12885 if (temp == NULL)
12886 goto onError;
12887 if (PyUnicode_Check(temp))
12888 /* nothing to do */;
12889 else {
12890 Py_DECREF(temp);
12891 PyErr_SetString(PyExc_TypeError,
12892 "%s argument has non-string str()");
12893 goto onError;
12894 }
12895 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012896 if (PyUnicode_READY(temp) == -1) {
12897 Py_CLEAR(temp);
12898 goto onError;
12899 }
12900 pbuf = PyUnicode_DATA(temp);
12901 kind = PyUnicode_KIND(temp);
12902 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012903 if (prec >= 0 && len > prec)
12904 len = prec;
12905 break;
12906
12907 case 'i':
12908 case 'd':
12909 case 'u':
12910 case 'o':
12911 case 'x':
12912 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012913 isnumok = 0;
12914 if (PyNumber_Check(v)) {
12915 PyObject *iobj=NULL;
12916
12917 if (PyLong_Check(v)) {
12918 iobj = v;
12919 Py_INCREF(iobj);
12920 }
12921 else {
12922 iobj = PyNumber_Long(v);
12923 }
12924 if (iobj!=NULL) {
12925 if (PyLong_Check(iobj)) {
12926 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012927 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012928 Py_DECREF(iobj);
12929 if (!temp)
12930 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012931 if (PyUnicode_READY(temp) == -1) {
12932 Py_CLEAR(temp);
12933 goto onError;
12934 }
12935 pbuf = PyUnicode_DATA(temp);
12936 kind = PyUnicode_KIND(temp);
12937 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012938 sign = 1;
12939 }
12940 else {
12941 Py_DECREF(iobj);
12942 }
12943 }
12944 }
12945 if (!isnumok) {
12946 PyErr_Format(PyExc_TypeError,
12947 "%%%c format: a number is required, "
12948 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12949 goto onError;
12950 }
12951 if (flags & F_ZERO)
12952 fill = '0';
12953 break;
12954
12955 case 'e':
12956 case 'E':
12957 case 'f':
12958 case 'F':
12959 case 'g':
12960 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012961 temp = formatfloat(v, flags, prec, c);
12962 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012963 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012964 if (PyUnicode_READY(temp) == -1) {
12965 Py_CLEAR(temp);
12966 goto onError;
12967 }
12968 pbuf = PyUnicode_DATA(temp);
12969 kind = PyUnicode_KIND(temp);
12970 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012971 sign = 1;
12972 if (flags & F_ZERO)
12973 fill = '0';
12974 break;
12975
12976 case 'c':
12977 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012978 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012979 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012980 if (len < 0)
12981 goto onError;
12982 break;
12983
12984 default:
12985 PyErr_Format(PyExc_ValueError,
12986 "unsupported format character '%c' (0x%x) "
12987 "at index %zd",
12988 (31<=c && c<=126) ? (char)c : '?',
12989 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012990 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012991 goto onError;
12992 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012993 /* pbuf is initialized here. */
12994 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012995 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012996 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12997 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12998 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012999 len--;
13000 }
13001 else if (flags & F_SIGN)
13002 sign = '+';
13003 else if (flags & F_BLANK)
13004 sign = ' ';
13005 else
13006 sign = 0;
13007 }
13008 if (width < len)
13009 width = len;
13010 if (rescnt - (sign != 0) < width) {
13011 reslen -= rescnt;
13012 rescnt = width + fmtcnt + 100;
13013 reslen += rescnt;
13014 if (reslen < 0) {
13015 Py_XDECREF(temp);
13016 PyErr_NoMemory();
13017 goto onError;
13018 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013019 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
13020 if (res0 == 0) {
13021 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000013022 Py_XDECREF(temp);
13023 goto onError;
13024 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013025 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000013026 }
13027 if (sign) {
13028 if (fill != ' ')
13029 *res++ = sign;
13030 rescnt--;
13031 if (width > len)
13032 width--;
13033 }
13034 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013035 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13036 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013037 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013038 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
13039 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013040 }
13041 rescnt -= 2;
13042 width -= 2;
13043 if (width < 0)
13044 width = 0;
13045 len -= 2;
13046 }
13047 if (width > len && !(flags & F_LJUST)) {
13048 do {
13049 --rescnt;
13050 *res++ = fill;
13051 } while (--width > len);
13052 }
13053 if (fill == ' ') {
13054 if (sign)
13055 *res++ = sign;
13056 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013057 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13058 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
13059 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
13060 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013061 }
13062 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013063 /* Copy all characters, preserving len */
13064 len1 = len;
13065 while (len1--) {
13066 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
13067 rescnt--;
13068 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013069 while (--width >= len) {
13070 --rescnt;
13071 *res++ = ' ';
13072 }
13073 if (dict && (argidx < arglen) && c != '%') {
13074 PyErr_SetString(PyExc_TypeError,
13075 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000013076 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013077 goto onError;
13078 }
13079 Py_XDECREF(temp);
13080 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013081 } /* until end */
13082 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013083 PyErr_SetString(PyExc_TypeError,
13084 "not all arguments converted during string formatting");
13085 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013086 }
13087
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013088
13089 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
13090 if (*res > max)
13091 max = *res;
13092 result = PyUnicode_New(reslen - rescnt, max);
13093 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000013094 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013095 kind = PyUnicode_KIND(result);
13096 for (res = res0; res < res0+reslen-rescnt; res++)
13097 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
13098 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013099 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013100 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013101 }
13102 Py_DECREF(uformat);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013103 assert(_PyUnicode_CheckConsistency(result, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000013104 return (PyObject *)result;
13105
Benjamin Peterson29060642009-01-31 22:14:21 +000013106 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013107 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013108 Py_DECREF(uformat);
13109 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013110 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013111 }
13112 return NULL;
13113}
13114
Jeremy Hylton938ace62002-07-17 16:30:39 +000013115static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013116unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13117
Tim Peters6d6c1a32001-08-02 04:15:00 +000013118static PyObject *
13119unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13120{
Benjamin Peterson29060642009-01-31 22:14:21 +000013121 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013122 static char *kwlist[] = {"object", "encoding", "errors", 0};
13123 char *encoding = NULL;
13124 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013125
Benjamin Peterson14339b62009-01-31 16:36:08 +000013126 if (type != &PyUnicode_Type)
13127 return unicode_subtype_new(type, args, kwds);
13128 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013129 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013130 return NULL;
13131 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013132 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013133 if (encoding == NULL && errors == NULL)
13134 return PyObject_Str(x);
13135 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013136 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013137}
13138
Guido van Rossume023fe02001-08-30 03:12:59 +000013139static PyObject *
13140unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13141{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013142 PyUnicodeObject *unicode, *self;
13143 Py_ssize_t length, char_size;
13144 int share_wstr, share_utf8;
13145 unsigned int kind;
13146 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013147
Benjamin Peterson14339b62009-01-31 16:36:08 +000013148 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013149
13150 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
13151 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013152 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013153 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013154 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013155 return NULL;
13156
13157 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
13158 if (self == NULL) {
13159 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013160 return NULL;
13161 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013162 kind = PyUnicode_KIND(unicode);
13163 length = PyUnicode_GET_LENGTH(unicode);
13164
13165 _PyUnicode_LENGTH(self) = length;
13166 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13167 _PyUnicode_STATE(self).interned = 0;
13168 _PyUnicode_STATE(self).kind = kind;
13169 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013170 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013171 _PyUnicode_STATE(self).ready = 1;
13172 _PyUnicode_WSTR(self) = NULL;
13173 _PyUnicode_UTF8_LENGTH(self) = 0;
13174 _PyUnicode_UTF8(self) = NULL;
13175 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013176 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013177
13178 share_utf8 = 0;
13179 share_wstr = 0;
13180 if (kind == PyUnicode_1BYTE_KIND) {
13181 char_size = 1;
13182 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13183 share_utf8 = 1;
13184 }
13185 else if (kind == PyUnicode_2BYTE_KIND) {
13186 char_size = 2;
13187 if (sizeof(wchar_t) == 2)
13188 share_wstr = 1;
13189 }
13190 else {
13191 assert(kind == PyUnicode_4BYTE_KIND);
13192 char_size = 4;
13193 if (sizeof(wchar_t) == 4)
13194 share_wstr = 1;
13195 }
13196
13197 /* Ensure we won't overflow the length. */
13198 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13199 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013200 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013201 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013202 data = PyObject_MALLOC((length + 1) * char_size);
13203 if (data == NULL) {
13204 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013205 goto onError;
13206 }
13207
Victor Stinnerc3c74152011-10-02 20:39:55 +020013208 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013209 if (share_utf8) {
13210 _PyUnicode_UTF8_LENGTH(self) = length;
13211 _PyUnicode_UTF8(self) = data;
13212 }
13213 if (share_wstr) {
13214 _PyUnicode_WSTR_LENGTH(self) = length;
13215 _PyUnicode_WSTR(self) = (wchar_t *)data;
13216 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013217
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013218 Py_MEMCPY(data, PyUnicode_DATA(unicode),
13219 PyUnicode_KIND_SIZE(kind, length + 1));
13220 Py_DECREF(unicode);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013221 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013222 return (PyObject *)self;
13223
13224onError:
13225 Py_DECREF(unicode);
13226 Py_DECREF(self);
13227 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013228}
13229
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013230PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013231 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013232\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013233Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013234encoding defaults to the current default string encoding.\n\
13235errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013236
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013237static PyObject *unicode_iter(PyObject *seq);
13238
Guido van Rossumd57fd912000-03-10 22:53:23 +000013239PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013240 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013241 "str", /* tp_name */
13242 sizeof(PyUnicodeObject), /* tp_size */
13243 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013244 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013245 (destructor)unicode_dealloc, /* tp_dealloc */
13246 0, /* tp_print */
13247 0, /* tp_getattr */
13248 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013249 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013250 unicode_repr, /* tp_repr */
13251 &unicode_as_number, /* tp_as_number */
13252 &unicode_as_sequence, /* tp_as_sequence */
13253 &unicode_as_mapping, /* tp_as_mapping */
13254 (hashfunc) unicode_hash, /* tp_hash*/
13255 0, /* tp_call*/
13256 (reprfunc) unicode_str, /* tp_str */
13257 PyObject_GenericGetAttr, /* tp_getattro */
13258 0, /* tp_setattro */
13259 0, /* tp_as_buffer */
13260 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013261 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013262 unicode_doc, /* tp_doc */
13263 0, /* tp_traverse */
13264 0, /* tp_clear */
13265 PyUnicode_RichCompare, /* tp_richcompare */
13266 0, /* tp_weaklistoffset */
13267 unicode_iter, /* tp_iter */
13268 0, /* tp_iternext */
13269 unicode_methods, /* tp_methods */
13270 0, /* tp_members */
13271 0, /* tp_getset */
13272 &PyBaseObject_Type, /* tp_base */
13273 0, /* tp_dict */
13274 0, /* tp_descr_get */
13275 0, /* tp_descr_set */
13276 0, /* tp_dictoffset */
13277 0, /* tp_init */
13278 0, /* tp_alloc */
13279 unicode_new, /* tp_new */
13280 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013281};
13282
13283/* Initialize the Unicode implementation */
13284
Thomas Wouters78890102000-07-22 19:25:51 +000013285void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013286{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013287 int i;
13288
Thomas Wouters477c8d52006-05-27 19:21:47 +000013289 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013290 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013291 0x000A, /* LINE FEED */
13292 0x000D, /* CARRIAGE RETURN */
13293 0x001C, /* FILE SEPARATOR */
13294 0x001D, /* GROUP SEPARATOR */
13295 0x001E, /* RECORD SEPARATOR */
13296 0x0085, /* NEXT LINE */
13297 0x2028, /* LINE SEPARATOR */
13298 0x2029, /* PARAGRAPH SEPARATOR */
13299 };
13300
Fred Drakee4315f52000-05-09 19:53:39 +000013301 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013302 unicode_empty = PyUnicode_New(0, 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013303 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013304 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013305 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013306
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013307 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013308 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013309 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013310 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013311
13312 /* initialize the linebreak bloom filter */
13313 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013314 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013315 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013316
13317 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013318}
13319
13320/* Finalize the Unicode implementation */
13321
/* This implementation releases nothing and always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
13327
Guido van Rossumd57fd912000-03-10 22:53:23 +000013328void
Thomas Wouters78890102000-07-22 19:25:51 +000013329_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013330{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013331 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013332
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013333 Py_XDECREF(unicode_empty);
13334 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013335
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013336 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013337 if (unicode_latin1[i]) {
13338 Py_DECREF(unicode_latin1[i]);
13339 unicode_latin1[i] = NULL;
13340 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013341 }
Christian Heimesa156e092008-02-16 07:38:31 +000013342 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013343}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013344
Walter Dörwald16807132007-05-25 13:52:07 +000013345void
13346PyUnicode_InternInPlace(PyObject **p)
13347{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013348 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
13349 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013350#ifdef Py_DEBUG
13351 assert(s != NULL);
13352 assert(_PyUnicode_CHECK(s));
13353#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013354 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013355 return;
13356#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013357 /* If it's a subclass, we don't really know what putting
13358 it in the interned dict might do. */
13359 if (!PyUnicode_CheckExact(s))
13360 return;
13361 if (PyUnicode_CHECK_INTERNED(s))
13362 return;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013363 if (_PyUnicode_READY_REPLACE(p)) {
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013364 assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013365 return;
13366 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013367 s = (PyUnicodeObject *)(*p);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013368 if (interned == NULL) {
13369 interned = PyDict_New();
13370 if (interned == NULL) {
13371 PyErr_Clear(); /* Don't leave an exception */
13372 return;
13373 }
13374 }
13375 /* It might be that the GetItem call fails even
13376 though the key is present in the dictionary,
13377 namely when this happens during a stack overflow. */
13378 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000013379 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013380 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013381
Benjamin Peterson29060642009-01-31 22:14:21 +000013382 if (t) {
13383 Py_INCREF(t);
13384 Py_DECREF(*p);
13385 *p = t;
13386 return;
13387 }
Walter Dörwald16807132007-05-25 13:52:07 +000013388
Benjamin Peterson14339b62009-01-31 16:36:08 +000013389 PyThreadState_GET()->recursion_critical = 1;
13390 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
13391 PyErr_Clear();
13392 PyThreadState_GET()->recursion_critical = 0;
13393 return;
13394 }
13395 PyThreadState_GET()->recursion_critical = 0;
13396 /* The two references in interned are not counted by refcnt.
13397 The deallocator will take care of this */
13398 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013399 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013400}
13401
13402void
13403PyUnicode_InternImmortal(PyObject **p)
13404{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013405 PyUnicodeObject *u = (PyUnicodeObject *)*p;
13406
Benjamin Peterson14339b62009-01-31 16:36:08 +000013407 PyUnicode_InternInPlace(p);
13408 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013409 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013410 Py_INCREF(*p);
13411 }
Walter Dörwald16807132007-05-25 13:52:07 +000013412}
13413
13414PyObject *
13415PyUnicode_InternFromString(const char *cp)
13416{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013417 PyObject *s = PyUnicode_FromString(cp);
13418 if (s == NULL)
13419 return NULL;
13420 PyUnicode_InternInPlace(&s);
13421 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013422}
13423
Alexander Belopolsky40018472011-02-26 01:02:56 +000013424void
13425_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013426{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013427 PyObject *keys;
13428 PyUnicodeObject *s;
13429 Py_ssize_t i, n;
13430 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013431
Benjamin Peterson14339b62009-01-31 16:36:08 +000013432 if (interned == NULL || !PyDict_Check(interned))
13433 return;
13434 keys = PyDict_Keys(interned);
13435 if (keys == NULL || !PyList_Check(keys)) {
13436 PyErr_Clear();
13437 return;
13438 }
Walter Dörwald16807132007-05-25 13:52:07 +000013439
Benjamin Peterson14339b62009-01-31 16:36:08 +000013440 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13441 detector, interned unicode strings are not forcibly deallocated;
13442 rather, we give them their stolen references back, and then clear
13443 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013444
Benjamin Peterson14339b62009-01-31 16:36:08 +000013445 n = PyList_GET_SIZE(keys);
13446 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013447 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013448 for (i = 0; i < n; i++) {
13449 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013450 if (PyUnicode_READY(s) == -1) {
13451 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013452 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013453 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013454 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013455 case SSTATE_NOT_INTERNED:
13456 /* XXX Shouldn't happen */
13457 break;
13458 case SSTATE_INTERNED_IMMORTAL:
13459 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013460 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013461 break;
13462 case SSTATE_INTERNED_MORTAL:
13463 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013464 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013465 break;
13466 default:
13467 Py_FatalError("Inconsistent interned string state.");
13468 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013469 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013470 }
13471 fprintf(stderr, "total size of all interned strings: "
13472 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13473 "mortal/immortal\n", mortal_size, immortal_size);
13474 Py_DECREF(keys);
13475 PyDict_Clear(interned);
13476 Py_DECREF(interned);
13477 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013478}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013479
13480
13481/********************* Unicode Iterator **************************/
13482
/* Instance layout of the str iterator (see PyUnicodeIter_Type). */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;     /* Index of the next character to return */
    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
13488
13489static void
13490unicodeiter_dealloc(unicodeiterobject *it)
13491{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013492 _PyObject_GC_UNTRACK(it);
13493 Py_XDECREF(it->it_seq);
13494 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013495}
13496
/* tp_traverse for the str iterator: the only object reference held is the
   string being iterated, which may be NULL after exhaustion. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
13503
13504static PyObject *
13505unicodeiter_next(unicodeiterobject *it)
13506{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013507 PyUnicodeObject *seq;
13508 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013509
Benjamin Peterson14339b62009-01-31 16:36:08 +000013510 assert(it != NULL);
13511 seq = it->it_seq;
13512 if (seq == NULL)
13513 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013514 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013515
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013516 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13517 int kind = PyUnicode_KIND(seq);
13518 void *data = PyUnicode_DATA(seq);
13519 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13520 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013521 if (item != NULL)
13522 ++it->it_index;
13523 return item;
13524 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013525
Benjamin Peterson14339b62009-01-31 16:36:08 +000013526 Py_DECREF(seq);
13527 it->it_seq = NULL;
13528 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013529}
13530
13531static PyObject *
13532unicodeiter_len(unicodeiterobject *it)
13533{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013534 Py_ssize_t len = 0;
13535 if (it->it_seq)
13536 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
13537 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013538}
13539
/* Docstring shared by the iterator's __length_hint__ method. */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table of the str iterator: only __length_hint__, which takes no
   arguments and is implemented by unicodeiter_len(). */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,              NULL}           /* sentinel */
};
13547
/* Type object of the iterator returned by unicode_iter().  It is a
   GC-tracked object that is its own iterator (tp_iter is
   PyObject_SelfIter); the slots are filled in positionally. */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",                     /* tp_name */
    sizeof(unicodeiterobject),          /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_reserved */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    PyObject_SelfIter,                  /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,                /* tp_methods */
    0,                                  /* slot after tp_methods: tp_members
                                           in the documented PyTypeObject
                                           layout */
};
13580
13581static PyObject *
13582unicode_iter(PyObject *seq)
13583{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013584 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013585
Benjamin Peterson14339b62009-01-31 16:36:08 +000013586 if (!PyUnicode_Check(seq)) {
13587 PyErr_BadInternalCall();
13588 return NULL;
13589 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013590 if (PyUnicode_READY(seq) == -1)
13591 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013592 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13593 if (it == NULL)
13594 return NULL;
13595 it->it_index = 0;
13596 Py_INCREF(seq);
13597 it->it_seq = (PyUnicodeObject *)seq;
13598 _PyObject_GC_TRACK(it);
13599 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013600}
13601
/* Include uniops.h twice as a template.  UNIOP(x) supplies the name prefix
   and UNIOP_t the element type: first Py_UNICODE_<x> over Py_UNICODE, then
   Py_UCS4_<x> over Py_UCS4.  Both macros are undefined again after each
   inclusion so they do not leak into the rest of the file. */
#define UNIOP(x) Py_UNICODE_##x
#define UNIOP_t Py_UNICODE
#include "uniops.h"
#undef UNIOP
#undef UNIOP_t
#define UNIOP(x) Py_UCS4_##x
#define UNIOP_t Py_UCS4
#include "uniops.h"
#undef UNIOP
#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000013612
Victor Stinner71133ff2010-09-01 23:43:53 +000013613Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000013614PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000013615{
13616 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
13617 Py_UNICODE *copy;
13618 Py_ssize_t size;
13619
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013620 if (!PyUnicode_Check(unicode)) {
13621 PyErr_BadArgument();
13622 return NULL;
13623 }
Victor Stinner71133ff2010-09-01 23:43:53 +000013624 /* Ensure we won't overflow the size. */
13625 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13626 PyErr_NoMemory();
13627 return NULL;
13628 }
13629 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13630 size *= sizeof(Py_UNICODE);
13631 copy = PyMem_Malloc(size);
13632 if (copy == NULL) {
13633 PyErr_NoMemory();
13634 return NULL;
13635 }
13636 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
13637 return copy;
13638}
Martin v. Löwis5b222132007-06-10 09:51:05 +000013639
Georg Brandl66c221e2010-10-14 07:04:07 +000013640/* A _string module, to export formatter_parser and formatter_field_name_split
13641 to the string.Formatter class implemented in Python. */
13642
/* Functions exported by the _string module.  Both are METH_O, i.e. they
   take exactly one argument. */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
13650
/* Module definition for _string, filled in positionally; the slots after
   the method table are all left NULL. */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                          /* module name */
    PyDoc_STR("string helper module"),  /* module docstring */
    0,                                  /* per-module state size */
    _string_methods,                    /* method table */
    NULL,
    NULL,
    NULL,
    NULL
};
13662
13663PyMODINIT_FUNC
13664PyInit__string(void)
13665{
13666 return PyModule_Create(&_string_module);
13667}
13668
13669
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013670#ifdef __cplusplus
13671}
13672#endif