blob: 21b8ab72d18ec4066149cd49ab38038fb459be03 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
/* Debug builds define DONT_MAKE_RESULT_READY. */
#ifdef Py_DEBUG
#  define DONT_MAKE_RESULT_READY
#endif

/* Upper bound on the number of entries kept in the Unicode object
   free list. */

#define PyUnicode_MAXFREELIST 1024

/* Size limit for the free list "stay alive" optimization.

   Unicode memory of objects on the free list is kept allocated as long
   as the object's size is below this limit, which reduces malloc()
   overhead for small Unicode objects.

   In the worst case this leaves PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage around.

   A limit of 0 effectively switches the feature off.

   Note: this is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; little endian unless WORDS_BIGENDIAN is defined */

#ifdef WORDS_BIGENDIAN
#  define BYTEORDER_IS_BIG_ENDIAN
#else
#  define BYTEORDER_IS_LITTLE_ENDIAN
#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
/* Type check used by the accessor macros below: debug builds run the
   full consistency check, release builds only check the type. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Raw utf8 field of a PyCompactUnicodeObject (no checks). */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* Checked UTF-8 pointer: for compact ASCII objects this is the memory
   right after the PyASCIIObject header, otherwise the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* Raw utf8_length field of a PyCompactUnicodeObject (no checks). */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* Checked UTF-8 length: compact ASCII objects use the length field,
   all others the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* Unchecked accessors for the remaining header fields. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)

/* Checked accessors for the kind and the length. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* Data pointer of a PyUnicodeObject (no checks). */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200136
/* Local replacement for PyUnicode_READY: evaluates to 0 when the object
   is already ready, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready((PyObject *)(op))))

/* Same as PyUnicode_READY, but takes a pointer to the object pointer
   and hands it to _PyUnicode_ReadyReplace() when the object is not
   ready yet.  The argument is parenthesized before being dereferenced
   so that expressions such as "array + 1" expand correctly. */
#define _PyUnicode_READY_REPLACE(p_obj)                 \
    (assert(_PyUnicode_CHECK(*(p_obj))),                \
     (PyUnicode_IS_READY(*(p_obj)) ?                    \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
148
/* True if the utf8 pointer of the (non compact-ASCII) object is the
   same pointer as its character data. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of the object is the same pointer as its
   character data.  Uses only the macro argument, so it works whatever
   the caller's variable is called. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
156
/* True if the Unicode object owns a separately allocated UTF-8 block:
   it is not compact ASCII, its utf8 pointer is set, and that pointer
   is not the character data itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object owns a separately allocated wstr block:
   its wstr pointer is set and the object is either not ready or the
   pointer is not the character data itself. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
172
/* Generic helper macro converting a run of characters from one integer
   type to another.  from_type and to_type must be valid type names;
   begin and end delimit the source characters and are of type
   "from_type *"; to points to the output buffer and is converted to
   "to_type *".  Each source character is cast to to_type and stored. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        const from_type *src_ = (begin);                \
        to_type *dst_ = (to_type *)(to);                \
        while (src_ < (end)) {                          \
            *dst_ = (to_type)*src_;                     \
            ++src_;                                     \
            ++dst_;                                     \
        }                                               \
    } while (0)

/* The Unicode string has been modified: invalidate the cached hash */
#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
190
Walter Dörwald16807132007-05-25 13:52:07 +0000191/* This dictionary holds all interned unicode strings. Note that references
192 to strings in this dictionary are *not* counted in the string's ob_refcnt.
193 When the interned string reaches a refcnt of 0 the string deallocation
194 function will delete the reference from this dictionary.
195
196 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000197 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000198*/
199static PyObject *interned;
200
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000201/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200202static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000203
204/* Single character Unicode strings in the Latin-1 range are being
205 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200206static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000207
/* Fast detection of the most frequent whitespace characters: entry i is
   1 when code point i (0..127) is one of the whitespace characters
   listed next to its row, 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x00 - 0x07 */
    0, 1, 1, 1, 1, 1, 0, 0,     /* 0x08 - 0x0F: 0x09 CHARACTER TABULATION,
                                   0x0A LINE FEED, 0x0B LINE TABULATION,
                                   0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x10 - 0x17 */
    0, 0, 0, 0, 1, 1, 1, 1,     /* 0x18 - 0x1F: 0x1C FILE SEPARATOR,
                                   0x1D GROUP SEPARATOR,
                                   0x1E RECORD SEPARATOR,
                                   0x1F UNIT SEPARATOR */
    1, 0, 0, 0, 0, 0, 0, 0,     /* 0x20 - 0x27: 0x20 SPACE */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x28 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x30 - 0x37 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x38 - 0x3F */

    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x40 - 0x47 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x48 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x50 - 0x57 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x58 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x60 - 0x67 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x68 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x70 - 0x77 */
    0, 0, 0, 0, 0, 0, 0, 0      /* 0x78 - 0x7F */
};
238
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200239/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200240static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200241static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200242
Alexander Belopolsky40018472011-02-26 01:02:56 +0000243static PyObject *
244unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000245 PyObject **errorHandler,const char *encoding, const char *reason,
246 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
247 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
248
Alexander Belopolsky40018472011-02-26 01:02:56 +0000249static void
250raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300251 const char *encoding,
252 const Py_UNICODE *unicode, Py_ssize_t size,
253 Py_ssize_t startpos, Py_ssize_t endpos,
254 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000255
/* Line break lookup table: entry i is 1 when code point i (0..127) is
   one of the line break characters listed next to its row, 0
   otherwise. */
static unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x00 - 0x07 */
    0, 0, 1, 1, 1, 1, 0, 0,     /* 0x08 - 0x0F: 0x0A LINE FEED,
                                   0x0B LINE TABULATION, 0x0C FORM FEED,
                                   0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x10 - 0x17 */
    0, 0, 0, 0, 1, 1, 1, 0,     /* 0x18 - 0x1F: 0x1C FILE SEPARATOR,
                                   0x1D GROUP SEPARATOR,
                                   0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x20 - 0x27 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x28 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x30 - 0x37 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x38 - 0x3F */

    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x40 - 0x47 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x48 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x50 - 0x57 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x58 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x60 - 0x67 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x68 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x70 - 0x77 */
    0, 0, 0, 0, 0, 0, 0, 0      /* 0x78 - 0x7F */
};
283
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300284/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
285 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000286Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000287PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000288{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000289#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000290 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000291#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000292 /* This is actually an illegal character, so it should
293 not be passed to unichr. */
294 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000295#endif
296}
297
#ifdef Py_DEBUG
/* Debug-build sanity check of a Unicode object's header: asserts that
   the state flags, the kind and the data/utf8/wstr pointers agree with
   each other.  Every check is an assert(); the function itself always
   returns 1 so that it can be used inside assert() expressions. */
static int
_PyUnicode_CheckConsistency(void *op)
{
    PyASCIIObject *ascii;
    PyCompactUnicodeObject *compact;
    unsigned int kind;
    unsigned int wchar_kind;
    void *data;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    /* Compact ASCII: only the kind and the ready flag are checked. */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
        return 1;
    }

    compact = (PyCompactUnicodeObject *)op;

    if (ascii->state.compact == 1) {
        /* Compact, not ASCII: the data follows the compact header. */
        data = compact + 1;
        assert(kind == PyUnicode_1BYTE_KIND
               || kind == PyUnicode_2BYTE_KIND
               || kind == PyUnicode_4BYTE_KIND);
        assert(ascii->state.ascii == 0);
        assert(ascii->state.ready == 1);
        assert(compact->utf8 != data);
    }
    else {
        PyUnicodeObject *unicode = (PyUnicodeObject *)op;

        data = unicode->data.any;
        if (kind == PyUnicode_WCHAR_KIND) {
            /* Not ready: only the wstr buffer exists. */
            assert(ascii->state.compact == 0);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 0);
            assert(ascii->wstr != NULL);
            assert(data == NULL);
            assert(compact->utf8 == NULL);
            assert(ascii->state.interned == SSTATE_NOT_INTERNED);
        }
        else {
            /* Ready, with a separately stored data buffer. */
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ready == 1);
            assert(data != NULL);
            if (ascii->state.ascii) {
                assert(compact->utf8 == data);
                assert(compact->utf8_length == ascii->length);
            }
            else {
                assert(compact->utf8 != data);
            }
        }
    }

    /* wstr must alias the data exactly when the kind has the width of
       wchar_t. */
#if SIZEOF_WCHAR_T == 2
    wchar_kind = PyUnicode_2BYTE_KIND;
#else
    wchar_kind = PyUnicode_4BYTE_KIND;
#endif
    if (kind != PyUnicode_WCHAR_KIND) {
        if (kind == wchar_kind) {
            assert(ascii->wstr == data);
            assert(compact->wstr_length == ascii->length);
        }
        else {
            assert(ascii->wstr != data);
        }
    }

    /* A missing buffer must come with a zero length. */
    if (compact->utf8 == NULL) {
        assert(compact->utf8_length == 0);
    }
    if (ascii->wstr == NULL) {
        assert(compact->wstr_length == 0);
    }
    return 1;
}
#else
/* Release builds: no checks are performed, the result is always 1. */
static int
_PyUnicode_CheckConsistency(void *op)
{
    return 1;
}
#endif
383
Thomas Wouters477c8d52006-05-27 19:21:47 +0000384/* --- Bloom Filters ----------------------------------------------------- */
385
386/* stuff to implement simple "bloom filters" for Unicode characters.
387 to keep things simple, we use a single bitmask, using the least 5
388 bits from each unicode characters as the bit index. */
389
390/* the linebreak mask is set up by Unicode_Init below */
391
Antoine Pitrouf068f942010-01-13 14:19:12 +0000392#if LONG_BIT >= 128
393#define BLOOM_WIDTH 128
394#elif LONG_BIT >= 64
395#define BLOOM_WIDTH 64
396#elif LONG_BIT >= 32
397#define BLOOM_WIDTH 32
398#else
399#error "LONG_BIT is smaller than 32"
400#endif
401
Thomas Wouters477c8d52006-05-27 19:21:47 +0000402#define BLOOM_MASK unsigned long
403
404static BLOOM_MASK bloom_linebreak;
405
Antoine Pitrouf068f942010-01-13 14:19:12 +0000406#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
407#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000408
Benjamin Peterson29060642009-01-31 22:14:21 +0000409#define BLOOM_LINEBREAK(ch) \
410 ((ch) < 128U ? ascii_linebreak[(ch)] : \
411 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000412
Alexander Belopolsky40018472011-02-26 01:02:56 +0000413Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200414make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000415{
416 /* calculate simple bloom-style bitmask for a given unicode string */
417
Antoine Pitrouf068f942010-01-13 14:19:12 +0000418 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000419 Py_ssize_t i;
420
421 mask = 0;
422 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200423 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000424
425 return mask;
426}
427
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200428#define BLOOM_MEMBER(mask, chr, str) \
429 (BLOOM(mask, chr) \
430 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000431
Guido van Rossumd57fd912000-03-10 22:53:23 +0000432/* --- Unicode Object ----------------------------------------------------- */
433
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200434static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200435fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200436
437Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
438 Py_ssize_t size, Py_UCS4 ch,
439 int direction)
440{
441 /* like wcschr, but doesn't stop at NULL characters */
442 Py_ssize_t i;
443 if (direction == 1) {
444 for(i = 0; i < size; i++)
445 if (PyUnicode_READ(kind, s, i) == ch)
446 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
447 }
448 else {
449 for(i = size-1; i >= 0; i--)
450 if (PyUnicode_READ(kind, s, i) == ch)
451 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
452 }
453 return NULL;
454}
455
Victor Stinnerfe226c02011-10-03 03:52:20 +0200456static PyObject*
457resize_compact(PyObject *unicode, Py_ssize_t length)
458{
459 Py_ssize_t char_size;
460 Py_ssize_t struct_size;
461 Py_ssize_t new_size;
462 int share_wstr;
463
464 assert(PyUnicode_IS_READY(unicode));
465 char_size = PyUnicode_CHARACTER_SIZE(unicode);
466 if (PyUnicode_IS_COMPACT_ASCII(unicode))
467 struct_size = sizeof(PyASCIIObject);
468 else
469 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200470 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200471
472 _Py_DEC_REFTOTAL;
473 _Py_ForgetReference(unicode);
474
475 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
476 PyErr_NoMemory();
477 return NULL;
478 }
479 new_size = (struct_size + (length + 1) * char_size);
480
481 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
482 if (unicode == NULL) {
483 PyObject_Del(unicode);
484 PyErr_NoMemory();
485 return NULL;
486 }
487 _Py_NewReference(unicode);
488 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200489 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200490 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200491 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
492 _PyUnicode_WSTR_LENGTH(unicode) = length;
493 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200494 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
495 length, 0);
496 return unicode;
497}
498
Alexander Belopolsky40018472011-02-26 01:02:56 +0000499static int
Victor Stinner95663112011-10-04 01:03:50 +0200500resize_inplace(PyUnicodeObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000501{
Victor Stinner95663112011-10-04 01:03:50 +0200502 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200503 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200504 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000505
Victor Stinner95663112011-10-04 01:03:50 +0200506 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200507
508 if (PyUnicode_IS_READY(unicode)) {
509 Py_ssize_t char_size;
510 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200511 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200512 void *data;
513
514 data = _PyUnicode_DATA_ANY(unicode);
515 assert(data != NULL);
516 char_size = PyUnicode_CHARACTER_SIZE(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200517 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
518 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200519 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
520 {
521 PyObject_DEL(_PyUnicode_UTF8(unicode));
522 _PyUnicode_UTF8(unicode) = NULL;
523 _PyUnicode_UTF8_LENGTH(unicode) = 0;
524 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200525
526 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
527 PyErr_NoMemory();
528 return -1;
529 }
530 new_size = (length + 1) * char_size;
531
532 data = (PyObject *)PyObject_REALLOC(data, new_size);
533 if (data == NULL) {
534 PyErr_NoMemory();
535 return -1;
536 }
537 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200538 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200539 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200540 _PyUnicode_WSTR_LENGTH(unicode) = length;
541 }
542 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200543 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200544 _PyUnicode_UTF8_LENGTH(unicode) = length;
545 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200546 _PyUnicode_LENGTH(unicode) = length;
547 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200548 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400549 _PyUnicode_CheckConsistency(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200550 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200551 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200552 }
Victor Stinner95663112011-10-04 01:03:50 +0200553 assert(_PyUnicode_WSTR(unicode) != NULL);
554
555 /* check for integer overflow */
556 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
557 PyErr_NoMemory();
558 return -1;
559 }
560 wstr = _PyUnicode_WSTR(unicode);
561 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
562 if (!wstr) {
563 PyErr_NoMemory();
564 return -1;
565 }
566 _PyUnicode_WSTR(unicode) = wstr;
567 _PyUnicode_WSTR(unicode)[length] = 0;
568 _PyUnicode_WSTR_LENGTH(unicode) = length;
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400569 _PyUnicode_CheckConsistency(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000570 return 0;
571}
572
Victor Stinnerfe226c02011-10-03 03:52:20 +0200573static PyObject*
574resize_copy(PyObject *unicode, Py_ssize_t length)
575{
576 Py_ssize_t copy_length;
577 if (PyUnicode_IS_COMPACT(unicode)) {
578 PyObject *copy;
579 assert(PyUnicode_IS_READY(unicode));
580
581 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
582 if (copy == NULL)
583 return NULL;
584
585 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
586 if (PyUnicode_CopyCharacters(copy, 0,
587 unicode, 0,
588 copy_length) < 0)
589 {
590 Py_DECREF(copy);
591 return NULL;
592 }
593 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200594 }
595 else {
Victor Stinner2fd82272011-10-03 04:06:05 +0200596 PyUnicodeObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200597 assert(_PyUnicode_WSTR(unicode) != NULL);
598 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner2fd82272011-10-03 04:06:05 +0200599 w = _PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200600 if (w == NULL)
601 return NULL;
602 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
603 copy_length = Py_MIN(copy_length, length);
604 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
605 copy_length);
606 return (PyObject*)w;
607 }
608}
609
Guido van Rossumd57fd912000-03-10 22:53:23 +0000610/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000611 Ux0000 terminated; some code (e.g. new_identifier)
612 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000613
614 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000615 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000616
617*/
618
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200619#ifdef Py_DEBUG
620int unicode_old_new_calls = 0;
621#endif
622
Alexander Belopolsky40018472011-02-26 01:02:56 +0000623static PyUnicodeObject *
624_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000625{
626 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200627 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000628
Thomas Wouters477c8d52006-05-27 19:21:47 +0000629 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000630 if (length == 0 && unicode_empty != NULL) {
631 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200632 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000633 }
634
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000635 /* Ensure we won't overflow the size. */
636 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
637 return (PyUnicodeObject *)PyErr_NoMemory();
638 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200639 if (length < 0) {
640 PyErr_SetString(PyExc_SystemError,
641 "Negative size passed to _PyUnicode_New");
642 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643 }
644
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200645#ifdef Py_DEBUG
646 ++unicode_old_new_calls;
647#endif
648
649 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
650 if (unicode == NULL)
651 return NULL;
652 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
653 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
654 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000655 PyErr_NoMemory();
656 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000657 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200658
Jeremy Hyltond8082792003-09-16 19:41:39 +0000659 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000660 * the caller fails before initializing str -- unicode_resize()
661 * reads str[0], and the Keep-Alive optimization can keep memory
662 * allocated for str alive across a call to unicode_dealloc(unicode).
663 * We don't want unicode_resize to read uninitialized memory in
664 * that case.
665 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200666 _PyUnicode_WSTR(unicode)[0] = 0;
667 _PyUnicode_WSTR(unicode)[length] = 0;
668 _PyUnicode_WSTR_LENGTH(unicode) = length;
669 _PyUnicode_HASH(unicode) = -1;
670 _PyUnicode_STATE(unicode).interned = 0;
671 _PyUnicode_STATE(unicode).kind = 0;
672 _PyUnicode_STATE(unicode).compact = 0;
673 _PyUnicode_STATE(unicode).ready = 0;
674 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200675 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200676 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200677 _PyUnicode_UTF8(unicode) = NULL;
678 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000679 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000680
Benjamin Peterson29060642009-01-31 22:14:21 +0000681 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000682 /* XXX UNREF/NEWREF interface should be more symmetrical */
683 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000684 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000685 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000686 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000687}
688
Victor Stinnerf42dc442011-10-02 23:33:16 +0200689static const char*
690unicode_kind_name(PyObject *unicode)
691{
Victor Stinner42dfd712011-10-03 14:41:45 +0200692 /* don't check consistency: unicode_kind_name() is called from
693 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200694 if (!PyUnicode_IS_COMPACT(unicode))
695 {
696 if (!PyUnicode_IS_READY(unicode))
697 return "wstr";
698 switch(PyUnicode_KIND(unicode))
699 {
700 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200701 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200702 return "legacy ascii";
703 else
704 return "legacy latin1";
705 case PyUnicode_2BYTE_KIND:
706 return "legacy UCS2";
707 case PyUnicode_4BYTE_KIND:
708 return "legacy UCS4";
709 default:
710 return "<legacy invalid kind>";
711 }
712 }
713 assert(PyUnicode_IS_READY(unicode));
714 switch(PyUnicode_KIND(unicode))
715 {
716 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200717 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200718 return "ascii";
719 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200720 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200721 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200722 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200723 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200724 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200725 default:
726 return "<invalid compact kind>";
727 }
728}
729
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200730#ifdef Py_DEBUG
/* Debug-build statistic: incremented by PyUnicode_New() each time it gets
   past its empty-string shortcut. */
int unicode_new_new_calls = 0;
732
733/* Functions wrapping macros for use in debugger */
/* Debugger helper: callable wrapper around the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    return PyUnicode_UTF8(unicode);
}
737
/* Debugger helper: callable wrapper around the _PyUnicode_COMPACT_DATA()
   macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    return _PyUnicode_COMPACT_DATA(unicode);
}
741void *_PyUnicode_data(void *unicode){
742 printf("obj %p\n", unicode);
743 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
744 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
745 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
746 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
747 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
748 return PyUnicode_DATA(unicode);
749}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200750
751void
752_PyUnicode_Dump(PyObject *op)
753{
754 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200755 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
756 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
757 void *data;
758 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
759 if (ascii->state.compact)
760 data = (compact + 1);
761 else
762 data = unicode->data.any;
763 if (ascii->wstr == data)
764 printf("shared ");
765 printf("wstr=%p", ascii->wstr);
Victor Stinnera3b334d2011-10-03 13:53:37 +0200766 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200767 printf(" (%zu), ", compact->wstr_length);
768 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
769 printf("shared ");
770 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200771 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200772 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200773}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200774#endif
775
776PyObject *
777PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
778{
779 PyObject *obj;
780 PyCompactUnicodeObject *unicode;
781 void *data;
782 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200783 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200784 Py_ssize_t char_size;
785 Py_ssize_t struct_size;
786
787 /* Optimization for empty strings */
788 if (size == 0 && unicode_empty != NULL) {
789 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200790 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200791 }
792
793#ifdef Py_DEBUG
794 ++unicode_new_new_calls;
795#endif
796
Victor Stinner9e9d6892011-10-04 01:02:02 +0200797 is_ascii = 0;
798 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200799 struct_size = sizeof(PyCompactUnicodeObject);
800 if (maxchar < 128) {
801 kind_state = PyUnicode_1BYTE_KIND;
802 char_size = 1;
803 is_ascii = 1;
804 struct_size = sizeof(PyASCIIObject);
805 }
806 else if (maxchar < 256) {
807 kind_state = PyUnicode_1BYTE_KIND;
808 char_size = 1;
809 }
810 else if (maxchar < 65536) {
811 kind_state = PyUnicode_2BYTE_KIND;
812 char_size = 2;
813 if (sizeof(wchar_t) == 2)
814 is_sharing = 1;
815 }
816 else {
817 kind_state = PyUnicode_4BYTE_KIND;
818 char_size = 4;
819 if (sizeof(wchar_t) == 4)
820 is_sharing = 1;
821 }
822
823 /* Ensure we won't overflow the size. */
824 if (size < 0) {
825 PyErr_SetString(PyExc_SystemError,
826 "Negative size passed to PyUnicode_New");
827 return NULL;
828 }
829 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
830 return PyErr_NoMemory();
831
832 /* Duplicated allocation code from _PyObject_New() instead of a call to
833 * PyObject_New() so we are able to allocate space for the object and
834 * it's data buffer.
835 */
836 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
837 if (obj == NULL)
838 return PyErr_NoMemory();
839 obj = PyObject_INIT(obj, &PyUnicode_Type);
840 if (obj == NULL)
841 return NULL;
842
843 unicode = (PyCompactUnicodeObject *)obj;
844 if (is_ascii)
845 data = ((PyASCIIObject*)obj) + 1;
846 else
847 data = unicode + 1;
848 _PyUnicode_LENGTH(unicode) = size;
849 _PyUnicode_HASH(unicode) = -1;
850 _PyUnicode_STATE(unicode).interned = 0;
851 _PyUnicode_STATE(unicode).kind = kind_state;
852 _PyUnicode_STATE(unicode).compact = 1;
853 _PyUnicode_STATE(unicode).ready = 1;
854 _PyUnicode_STATE(unicode).ascii = is_ascii;
855 if (is_ascii) {
856 ((char*)data)[size] = 0;
857 _PyUnicode_WSTR(unicode) = NULL;
858 }
859 else if (kind_state == PyUnicode_1BYTE_KIND) {
860 ((char*)data)[size] = 0;
861 _PyUnicode_WSTR(unicode) = NULL;
862 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200863 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200864 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200865 }
866 else {
867 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200868 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200869 if (kind_state == PyUnicode_2BYTE_KIND)
870 ((Py_UCS2*)data)[size] = 0;
871 else /* kind_state == PyUnicode_4BYTE_KIND */
872 ((Py_UCS4*)data)[size] = 0;
873 if (is_sharing) {
874 _PyUnicode_WSTR_LENGTH(unicode) = size;
875 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
876 }
877 else {
878 _PyUnicode_WSTR_LENGTH(unicode) = 0;
879 _PyUnicode_WSTR(unicode) = NULL;
880 }
881 }
882 return obj;
883}
884
#if SIZEOF_WCHAR_T == 2
/* Decode the 16-bit wchar_t range [begin, end) into the UCS4 buffer of
   `unicode`.  A high surrogate immediately followed by a low surrogate is
   combined into a single code point; every other unit, lone surrogates
   included, is copied unchanged.  The remaining conversions are implemented
   as macros for efficiency.

   `unicode` must be a 4-byte kind string whose length equals the number of
   code points produced, and it must have room for one more code point for
   the terminating null character (which is not written here). */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    src = begin;
    while (src < end) {
        int is_pair;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        is_pair = (src[0] >= 0xD800 && src[0] <= 0xDBFF
                   && (src+1) < end
                   && src[1] >= 0xDC00 && src[1] <= 0xDFFF);
        if (!is_pair) {
            *dst++ = *src++;
            continue;
        }
        *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
        src += 2;
    }
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
923
Victor Stinnercd9950f2011-10-02 00:34:53 +0200924static int
925_PyUnicode_Dirty(PyObject *unicode)
926{
Victor Stinner910337b2011-10-03 03:20:16 +0200927 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +0200928 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +0200929 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +0200930 "Cannot modify a string having more than 1 reference");
931 return -1;
932 }
933 _PyUnicode_DIRTY(unicode);
934 return 0;
935}
936
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200937Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200938PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
939 PyObject *from, Py_ssize_t from_start,
940 Py_ssize_t how_many)
941{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200942 unsigned int from_kind, to_kind;
943 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200944
Victor Stinnerb1536152011-09-30 02:26:10 +0200945 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
946 PyErr_BadInternalCall();
947 return -1;
948 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200949
950 if (PyUnicode_READY(from))
951 return -1;
952 if (PyUnicode_READY(to))
953 return -1;
954
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200955 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200956 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
Victor Stinner01698042011-10-04 00:04:26 +0200957 PyErr_Format(PyExc_SystemError,
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200958 "Cannot write %zi characters at %zi "
959 "in a string of %zi characters",
960 how_many, to_start, PyUnicode_GET_LENGTH(to));
961 return -1;
962 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200963 if (how_many == 0)
964 return 0;
965
Victor Stinnercd9950f2011-10-02 00:34:53 +0200966 if (_PyUnicode_Dirty(to))
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200967 return -1;
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200968
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200969 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200970 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200971 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200972 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200973
Victor Stinnerf42dc442011-10-02 23:33:16 +0200974 if (from_kind == to_kind
975 /* deny latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +0200976 && !(!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200977 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200978 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200979 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +0200980 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200981 + PyUnicode_KIND_SIZE(from_kind, from_start),
982 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200983 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200984 else if (from_kind == PyUnicode_1BYTE_KIND
985 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200986 {
987 _PyUnicode_CONVERT_BYTES(
988 Py_UCS1, Py_UCS2,
989 PyUnicode_1BYTE_DATA(from) + from_start,
990 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
991 PyUnicode_2BYTE_DATA(to) + to_start
992 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200993 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200994 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200995 && to_kind == PyUnicode_4BYTE_KIND)
996 {
997 _PyUnicode_CONVERT_BYTES(
998 Py_UCS1, Py_UCS4,
999 PyUnicode_1BYTE_DATA(from) + from_start,
1000 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1001 PyUnicode_4BYTE_DATA(to) + to_start
1002 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001003 }
1004 else if (from_kind == PyUnicode_2BYTE_KIND
1005 && to_kind == PyUnicode_4BYTE_KIND)
1006 {
1007 _PyUnicode_CONVERT_BYTES(
1008 Py_UCS2, Py_UCS4,
1009 PyUnicode_2BYTE_DATA(from) + from_start,
1010 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1011 PyUnicode_4BYTE_DATA(to) + to_start
1012 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001013 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001014 else {
1015 int invalid_kinds;
Victor Stinnerf42dc442011-10-02 23:33:16 +02001016
1017 /* check if max_char(from substring) <= max_char(to) */
1018 if (from_kind > to_kind
1019 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001020 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001021 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001022 /* slow path to check for character overflow */
1023 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1024 Py_UCS4 ch, maxchar;
1025 Py_ssize_t i;
1026
1027 maxchar = 0;
1028 invalid_kinds = 0;
1029 for (i=0; i < how_many; i++) {
1030 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1031 if (ch > maxchar) {
1032 maxchar = ch;
1033 if (maxchar > to_maxchar) {
1034 invalid_kinds = 1;
1035 break;
1036 }
1037 }
1038 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1039 }
1040 }
1041 else
1042 invalid_kinds = 1;
1043 if (invalid_kinds) {
Victor Stinner01698042011-10-04 00:04:26 +02001044 PyErr_Format(PyExc_SystemError,
Victor Stinnerf42dc442011-10-02 23:33:16 +02001045 "Cannot copy %s characters "
1046 "into a string of %s characters",
1047 unicode_kind_name(from),
1048 unicode_kind_name(to));
Victor Stinnera0702ab2011-09-29 14:14:38 +02001049 return -1;
1050 }
1051 }
1052 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001053}
1054
Victor Stinner17222162011-09-28 22:15:37 +02001055/* Find the maximum code point and count the number of surrogate pairs so a
1056 correct string length can be computed before converting a string to UCS4.
1057 This function counts single surrogates as a character and not as a pair.
1058
1059 Return 0 on success, or -1 on error. */
1060static int
1061find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1062 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001063{
1064 const wchar_t *iter;
1065
Victor Stinnerc53be962011-10-02 21:33:54 +02001066 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001067 *num_surrogates = 0;
1068 *maxchar = 0;
1069
1070 for (iter = begin; iter < end; ) {
Victor Stinnerae864852011-10-05 14:02:44 +02001071 if (*iter > *maxchar) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001072 *maxchar = *iter;
Victor Stinnerae864852011-10-05 14:02:44 +02001073#if SIZEOF_WCHAR_T != 2
1074 if (*maxchar >= 0x10000)
1075 return 0;
1076#endif
1077 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001078#if SIZEOF_WCHAR_T == 2
1079 if (*iter >= 0xD800 && *iter <= 0xDBFF
1080 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1081 {
1082 Py_UCS4 surrogate_val;
1083 surrogate_val = (((iter[0] & 0x3FF)<<10)
1084 | (iter[1] & 0x3FF)) + 0x10000;
1085 ++(*num_surrogates);
1086 if (surrogate_val > *maxchar)
1087 *maxchar = surrogate_val;
1088 iter += 2;
1089 }
1090 else
1091 iter++;
1092#else
1093 iter++;
1094#endif
1095 }
1096 return 0;
1097}
1098
1099#ifdef Py_DEBUG
/* Debug-build statistic: incremented on every call to unicode_ready(). */
int unicode_ready_calls = 0;
1101#endif
1102
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001103static int
1104unicode_ready(PyObject **p_obj, int replace)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001105{
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001106 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001107 wchar_t *end;
1108 Py_UCS4 maxchar = 0;
1109 Py_ssize_t num_surrogates;
1110#if SIZEOF_WCHAR_T == 2
1111 Py_ssize_t length_wo_surrogates;
1112#endif
1113
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001114 assert(p_obj != NULL);
1115 unicode = (PyUnicodeObject *)*p_obj;
1116
Georg Brandl7597add2011-10-05 16:36:47 +02001117 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001118 strings were created using _PyObject_New() and where no canonical
1119 representation (the str field) has been set yet aka strings
1120 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001121 assert(_PyUnicode_CHECK(unicode));
1122 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001123 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001124 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001125 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001126 /* Actually, it should neither be interned nor be anything else: */
1127 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001128
1129#ifdef Py_DEBUG
1130 ++unicode_ready_calls;
1131#endif
1132
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001133#ifdef Py_DEBUG
1134 assert(!replace || Py_REFCNT(unicode) == 1);
1135#else
1136 if (replace && Py_REFCNT(unicode) != 1)
1137 replace = 0;
1138#endif
1139 if (replace) {
1140 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
1141 wchar_t *wstr = _PyUnicode_WSTR(unicode);
1142 /* Optimization for empty strings */
1143 if (len == 0) {
1144 Py_INCREF(unicode_empty);
1145 Py_DECREF(*p_obj);
1146 *p_obj = unicode_empty;
1147 return 0;
1148 }
1149 if (len == 1 && wstr[0] < 256) {
1150 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
1151 if (latin1_char == NULL)
1152 return -1;
1153 Py_DECREF(*p_obj);
1154 *p_obj = latin1_char;
1155 return 0;
1156 }
1157 }
1158
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001159 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001160 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001161 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001162 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001163
1164 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001165 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1166 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001167 PyErr_NoMemory();
1168 return -1;
1169 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001170 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001171 _PyUnicode_WSTR(unicode), end,
1172 PyUnicode_1BYTE_DATA(unicode));
1173 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1174 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1175 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1176 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001177 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001178 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001179 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001180 }
1181 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001182 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001183 _PyUnicode_UTF8(unicode) = NULL;
1184 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001185 }
1186 PyObject_FREE(_PyUnicode_WSTR(unicode));
1187 _PyUnicode_WSTR(unicode) = NULL;
1188 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1189 }
1190 /* In this case we might have to convert down from 4-byte native
1191 wchar_t to 2-byte unicode. */
1192 else if (maxchar < 65536) {
1193 assert(num_surrogates == 0 &&
1194 "FindMaxCharAndNumSurrogatePairs() messed up");
1195
Victor Stinner506f5922011-09-28 22:34:18 +02001196#if SIZEOF_WCHAR_T == 2
1197 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001198 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001199 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1200 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1201 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001202 _PyUnicode_UTF8(unicode) = NULL;
1203 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001204#else
1205 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001206 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001207 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001208 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001209 PyErr_NoMemory();
1210 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001211 }
Victor Stinner506f5922011-09-28 22:34:18 +02001212 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1213 _PyUnicode_WSTR(unicode), end,
1214 PyUnicode_2BYTE_DATA(unicode));
1215 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1216 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1217 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001218 _PyUnicode_UTF8(unicode) = NULL;
1219 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001220 PyObject_FREE(_PyUnicode_WSTR(unicode));
1221 _PyUnicode_WSTR(unicode) = NULL;
1222 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1223#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001224 }
1225 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1226 else {
1227#if SIZEOF_WCHAR_T == 2
1228 /* in case the native representation is 2-bytes, we need to allocate a
1229 new normalized 4-byte version. */
1230 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001231 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1232 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001233 PyErr_NoMemory();
1234 return -1;
1235 }
1236 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1237 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001238 _PyUnicode_UTF8(unicode) = NULL;
1239 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001240 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1241 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001242 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001243 PyObject_FREE(_PyUnicode_WSTR(unicode));
1244 _PyUnicode_WSTR(unicode) = NULL;
1245 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1246#else
1247 assert(num_surrogates == 0);
1248
Victor Stinnerc3c74152011-10-02 20:39:55 +02001249 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001250 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001251 _PyUnicode_UTF8(unicode) = NULL;
1252 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001253 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1254#endif
1255 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1256 }
1257 _PyUnicode_STATE(unicode).ready = 1;
1258 return 0;
1259}
1260
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001261int
1262_PyUnicode_ReadyReplace(PyObject **op)
1263{
1264 return unicode_ready(op, 1);
1265}
1266
1267int
1268_PyUnicode_Ready(PyObject *op)
1269{
1270 return unicode_ready(&op, 0);
1271}
1272
Alexander Belopolsky40018472011-02-26 01:02:56 +00001273static void
1274unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001275{
Walter Dörwald16807132007-05-25 13:52:07 +00001276 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001277 case SSTATE_NOT_INTERNED:
1278 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001279
Benjamin Peterson29060642009-01-31 22:14:21 +00001280 case SSTATE_INTERNED_MORTAL:
1281 /* revive dead object temporarily for DelItem */
1282 Py_REFCNT(unicode) = 3;
1283 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1284 Py_FatalError(
1285 "deletion of interned string failed");
1286 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001287
Benjamin Peterson29060642009-01-31 22:14:21 +00001288 case SSTATE_INTERNED_IMMORTAL:
1289 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001290
Benjamin Peterson29060642009-01-31 22:14:21 +00001291 default:
1292 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001293 }
1294
Victor Stinner03490912011-10-03 23:45:12 +02001295 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001296 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001297 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001298 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001299
1300 if (PyUnicode_IS_COMPACT(unicode)) {
1301 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001302 }
1303 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001304 if (_PyUnicode_DATA_ANY(unicode))
1305 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001306 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001307 }
1308}
1309
Alexander Belopolsky40018472011-02-26 01:02:56 +00001310static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001311unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001312{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001313 if (Py_REFCNT(unicode) != 1)
1314 return 0;
1315 if (PyUnicode_CHECK_INTERNED(unicode))
1316 return 0;
Benjamin Peterson7f3140e2011-10-03 19:37:29 -04001317 assert(unicode != unicode_empty);
Victor Stinner77bb47b2011-10-03 20:06:05 +02001318#ifdef Py_DEBUG
1319 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND
1320 && PyUnicode_GET_LENGTH(unicode) == 1)
1321 {
1322 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001323 if (ch < 256 && unicode_latin1[ch] == unicode)
1324 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001325 }
Victor Stinner77bb47b2011-10-03 20:06:05 +02001326#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001327 return 1;
1328}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001329
Victor Stinnerfe226c02011-10-03 03:52:20 +02001330static int
1331unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1332{
1333 PyObject *unicode;
1334 Py_ssize_t old_length;
1335
1336 assert(p_unicode != NULL);
1337 unicode = *p_unicode;
1338
1339 assert(unicode != NULL);
1340 assert(PyUnicode_Check(unicode));
1341 assert(0 <= length);
1342
Victor Stinner910337b2011-10-03 03:20:16 +02001343 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001344 old_length = PyUnicode_WSTR_LENGTH(unicode);
1345 else
1346 old_length = PyUnicode_GET_LENGTH(unicode);
1347 if (old_length == length)
1348 return 0;
1349
Victor Stinnerfe226c02011-10-03 03:52:20 +02001350 if (!unicode_resizable(unicode)) {
1351 PyObject *copy = resize_copy(unicode, length);
1352 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001353 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001354 Py_DECREF(*p_unicode);
1355 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001356 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001357 }
1358
Victor Stinnerfe226c02011-10-03 03:52:20 +02001359 if (PyUnicode_IS_COMPACT(unicode)) {
1360 *p_unicode = resize_compact(unicode, length);
1361 if (*p_unicode == NULL)
1362 return -1;
Benjamin Petersonccc51c12011-10-03 19:34:12 -04001363 _PyUnicode_CheckConsistency(*p_unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001364 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001365 }
1366 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001367}
1368
Alexander Belopolsky40018472011-02-26 01:02:56 +00001369int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001370PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001371{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001372 PyObject *unicode;
1373 if (p_unicode == NULL) {
1374 PyErr_BadInternalCall();
1375 return -1;
1376 }
1377 unicode = *p_unicode;
1378 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1379 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1380 {
1381 PyErr_BadInternalCall();
1382 return -1;
1383 }
1384 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001385}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001386
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001387static PyObject*
1388get_latin1_char(unsigned char ch)
1389{
Victor Stinnera464fc12011-10-02 20:39:30 +02001390 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001391 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001392 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001393 if (!unicode)
1394 return NULL;
1395 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1396 unicode_latin1[ch] = unicode;
1397 }
1398 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001399 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001400}
1401
Alexander Belopolsky40018472011-02-26 01:02:56 +00001402PyObject *
1403PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001404{
1405 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001406 Py_UCS4 maxchar = 0;
1407 Py_ssize_t num_surrogates;
1408
1409 if (u == NULL)
1410 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001411
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001412 /* If the Unicode data is known at construction time, we can apply
1413 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001414
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001415 /* Optimization for empty strings */
1416 if (size == 0 && unicode_empty != NULL) {
1417 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001418 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001419 }
Tim Petersced69f82003-09-16 20:30:58 +00001420
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001421 /* Single character Unicode objects in the Latin-1 range are
1422 shared when using this constructor */
1423 if (size == 1 && *u < 256)
1424 return get_latin1_char((unsigned char)*u);
1425
1426 /* If not empty and not single character, copy the Unicode data
1427 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001428 if (find_maxchar_surrogates(u, u + size,
1429 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001430 return NULL;
1431
1432 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1433 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001434 if (!unicode)
1435 return NULL;
1436
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001437 switch (PyUnicode_KIND(unicode)) {
1438 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001439 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001440 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1441 break;
1442 case PyUnicode_2BYTE_KIND:
1443#if Py_UNICODE_SIZE == 2
1444 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1445#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001446 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001447 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1448#endif
1449 break;
1450 case PyUnicode_4BYTE_KIND:
1451#if SIZEOF_WCHAR_T == 2
1452 /* This is the only case which has to process surrogates, thus
1453 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001454 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001455#else
1456 assert(num_surrogates == 0);
1457 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1458#endif
1459 break;
1460 default:
1461 assert(0 && "Impossible state");
1462 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001463
1464 return (PyObject *)unicode;
1465}
1466
Alexander Belopolsky40018472011-02-26 01:02:56 +00001467PyObject *
1468PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001469{
1470 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001471
Benjamin Peterson14339b62009-01-31 16:36:08 +00001472 if (size < 0) {
1473 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001474 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001475 return NULL;
1476 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001477
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001478 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001479 some optimizations which share commonly used objects.
1480 Also, this means the input must be UTF-8, so fall back to the
1481 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001482 if (u != NULL) {
1483
Benjamin Peterson29060642009-01-31 22:14:21 +00001484 /* Optimization for empty strings */
1485 if (size == 0 && unicode_empty != NULL) {
1486 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001487 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001488 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001489
1490 /* Single characters are shared when using this constructor.
1491 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001492 if (size == 1 && Py_CHARMASK(*u) < 128)
1493 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001494
1495 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001496 }
1497
Walter Dörwald55507312007-05-18 13:12:10 +00001498 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001499 if (!unicode)
1500 return NULL;
1501
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001502 return (PyObject *)unicode;
1503}
1504
Alexander Belopolsky40018472011-02-26 01:02:56 +00001505PyObject *
1506PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001507{
1508 size_t size = strlen(u);
1509 if (size > PY_SSIZE_T_MAX) {
1510 PyErr_SetString(PyExc_OverflowError, "input too long");
1511 return NULL;
1512 }
1513
1514 return PyUnicode_FromStringAndSize(u, size);
1515}
1516
Victor Stinnere57b1c02011-09-28 22:20:48 +02001517static PyObject*
Victor Stinner702c7342011-10-05 13:50:52 +02001518unicode_fromascii(const unsigned char* u, Py_ssize_t size)
1519{
1520 PyObject *res = PyUnicode_New(size, 127);
1521 if (!res)
1522 return NULL;
1523 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1524 return res;
1525}
1526
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001527static Py_UCS4
1528kind_maxchar_limit(unsigned int kind)
1529{
1530 switch(kind) {
1531 case PyUnicode_1BYTE_KIND:
1532 return 0x80;
1533 case PyUnicode_2BYTE_KIND:
1534 return 0x100;
1535 case PyUnicode_4BYTE_KIND:
1536 return 0x10000;
1537 default:
1538 assert(0 && "invalid kind");
1539 return 0x10ffff;
1540 }
1541}
1542
Victor Stinner702c7342011-10-05 13:50:52 +02001543static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001544_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001545{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001546 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001547 unsigned char max_char = 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001548 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001549
1550 assert(size >= 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001551 for (i = 0; i < size; i++) {
1552 if (u[i] & 0x80) {
Victor Stinnerb9275c12011-10-05 14:01:42 +02001553 max_char = 255;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001554 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001555 }
1556 }
Victor Stinnerb9275c12011-10-05 14:01:42 +02001557 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001558 if (!res)
1559 return NULL;
1560 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1561 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001562}
1563
Victor Stinnere57b1c02011-09-28 22:20:48 +02001564static PyObject*
1565_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001566{
1567 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001568 Py_UCS2 max_char = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001569 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001570
1571 assert(size >= 0);
1572 for (i = 0; i < size; i++) {
1573 if (u[i] > max_char) {
1574 max_char = u[i];
1575 if (max_char >= 256)
1576 break;
1577 }
1578 }
1579 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001580 if (!res)
1581 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001582 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001583 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1584 else
1585 for (i = 0; i < size; i++)
1586 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1587 return res;
1588}
1589
Victor Stinnere57b1c02011-09-28 22:20:48 +02001590static PyObject*
1591_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001592{
1593 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001594 Py_UCS4 max_char = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001595 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001596
1597 assert(size >= 0);
1598 for (i = 0; i < size; i++) {
1599 if (u[i] > max_char) {
1600 max_char = u[i];
1601 if (max_char >= 0x10000)
1602 break;
1603 }
1604 }
1605 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001606 if (!res)
1607 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001608 if (max_char >= 0x10000)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001609 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1610 else {
1611 int kind = PyUnicode_KIND(res);
1612 void *data = PyUnicode_DATA(res);
1613 for (i = 0; i < size; i++)
1614 PyUnicode_WRITE(kind, data, i, u[i]);
1615 }
1616 return res;
1617}
1618
1619PyObject*
1620PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1621{
1622 switch(kind) {
1623 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001624 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001625 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001626 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001627 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001628 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001629 default:
1630 assert(0 && "invalid kind");
1631 PyErr_SetString(PyExc_SystemError, "invalid kind");
1632 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001633 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001634}
1635
Victor Stinner034f6cf2011-09-30 02:26:44 +02001636PyObject*
1637PyUnicode_Copy(PyObject *unicode)
1638{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001639 Py_ssize_t size;
1640 PyObject *copy;
1641 void *data;
1642
Victor Stinner034f6cf2011-09-30 02:26:44 +02001643 if (!PyUnicode_Check(unicode)) {
1644 PyErr_BadInternalCall();
1645 return NULL;
1646 }
1647 if (PyUnicode_READY(unicode))
1648 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001649
1650 size = PyUnicode_GET_LENGTH(unicode);
1651 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1652 if (!copy)
1653 return NULL;
1654 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1655
1656 data = PyUnicode_DATA(unicode);
1657 switch (PyUnicode_KIND(unicode))
1658 {
1659 case PyUnicode_1BYTE_KIND:
1660 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1661 break;
1662 case PyUnicode_2BYTE_KIND:
1663 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1664 break;
1665 case PyUnicode_4BYTE_KIND:
1666 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1667 break;
1668 default:
1669 assert(0);
1670 break;
1671 }
1672 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001673}
1674
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001675
Victor Stinnerbc603d12011-10-02 01:00:40 +02001676/* Widen Unicode objects to larger buffers. Don't write terminating null
1677 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001678
1679void*
1680_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1681{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001682 Py_ssize_t len;
1683 void *result;
1684 unsigned int skind;
1685
1686 if (PyUnicode_READY(s))
1687 return NULL;
1688
1689 len = PyUnicode_GET_LENGTH(s);
1690 skind = PyUnicode_KIND(s);
1691 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001692 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001693 return NULL;
1694 }
1695 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001696 case PyUnicode_2BYTE_KIND:
1697 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1698 if (!result)
1699 return PyErr_NoMemory();
1700 assert(skind == PyUnicode_1BYTE_KIND);
1701 _PyUnicode_CONVERT_BYTES(
1702 Py_UCS1, Py_UCS2,
1703 PyUnicode_1BYTE_DATA(s),
1704 PyUnicode_1BYTE_DATA(s) + len,
1705 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001706 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001707 case PyUnicode_4BYTE_KIND:
1708 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1709 if (!result)
1710 return PyErr_NoMemory();
1711 if (skind == PyUnicode_2BYTE_KIND) {
1712 _PyUnicode_CONVERT_BYTES(
1713 Py_UCS2, Py_UCS4,
1714 PyUnicode_2BYTE_DATA(s),
1715 PyUnicode_2BYTE_DATA(s) + len,
1716 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001717 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001718 else {
1719 assert(skind == PyUnicode_1BYTE_KIND);
1720 _PyUnicode_CONVERT_BYTES(
1721 Py_UCS1, Py_UCS4,
1722 PyUnicode_1BYTE_DATA(s),
1723 PyUnicode_1BYTE_DATA(s) + len,
1724 result);
1725 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001726 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001727 default:
1728 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001729 }
Victor Stinner01698042011-10-04 00:04:26 +02001730 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001731 return NULL;
1732}
1733
1734static Py_UCS4*
1735as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1736 int copy_null)
1737{
1738 int kind;
1739 void *data;
1740 Py_ssize_t len, targetlen;
1741 if (PyUnicode_READY(string) == -1)
1742 return NULL;
1743 kind = PyUnicode_KIND(string);
1744 data = PyUnicode_DATA(string);
1745 len = PyUnicode_GET_LENGTH(string);
1746 targetlen = len;
1747 if (copy_null)
1748 targetlen++;
1749 if (!target) {
1750 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1751 PyErr_NoMemory();
1752 return NULL;
1753 }
1754 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1755 if (!target) {
1756 PyErr_NoMemory();
1757 return NULL;
1758 }
1759 }
1760 else {
1761 if (targetsize < targetlen) {
1762 PyErr_Format(PyExc_SystemError,
1763 "string is longer than the buffer");
1764 if (copy_null && 0 < targetsize)
1765 target[0] = 0;
1766 return NULL;
1767 }
1768 }
1769 if (kind != PyUnicode_4BYTE_KIND) {
1770 Py_ssize_t i;
1771 for (i = 0; i < len; i++)
1772 target[i] = PyUnicode_READ(kind, data, i);
1773 }
1774 else
1775 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1776 if (copy_null)
1777 target[len] = 0;
1778 return target;
1779}
1780
1781Py_UCS4*
1782PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1783 int copy_null)
1784{
1785 if (target == NULL || targetsize < 1) {
1786 PyErr_BadInternalCall();
1787 return NULL;
1788 }
1789 return as_ucs4(string, target, targetsize, copy_null);
1790}
1791
1792Py_UCS4*
1793PyUnicode_AsUCS4Copy(PyObject *string)
1794{
1795 return as_ucs4(string, NULL, 0, 1);
1796}
1797
1798#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00001799
Alexander Belopolsky40018472011-02-26 01:02:56 +00001800PyObject *
1801PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001802{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001803 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +00001804 if (size == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001805 return PyUnicode_New(0, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00001806 PyErr_BadInternalCall();
1807 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001808 }
1809
Martin v. Löwis790465f2008-04-05 20:41:37 +00001810 if (size == -1) {
1811 size = wcslen(w);
1812 }
1813
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001814 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001815}
1816
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001817#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001818
Walter Dörwald346737f2007-05-31 10:44:43 +00001819static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001820makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1821 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001822{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001823 *fmt++ = '%';
1824 if (width) {
1825 if (zeropad)
1826 *fmt++ = '0';
1827 fmt += sprintf(fmt, "%d", width);
1828 }
1829 if (precision)
1830 fmt += sprintf(fmt, ".%d", precision);
1831 if (longflag)
1832 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001833 else if (longlongflag) {
1834 /* longlongflag should only ever be nonzero on machines with
1835 HAVE_LONG_LONG defined */
1836#ifdef HAVE_LONG_LONG
1837 char *f = PY_FORMAT_LONG_LONG;
1838 while (*f)
1839 *fmt++ = *f++;
1840#else
1841 /* we shouldn't ever get here */
1842 assert(0);
1843 *fmt++ = 'l';
1844#endif
1845 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001846 else if (size_tflag) {
1847 char *f = PY_FORMAT_SIZE_T;
1848 while (*f)
1849 *fmt++ = *f++;
1850 }
1851 *fmt++ = c;
1852 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001853}
1854
/* helper for PyUnicode_FromFormatV()

   Parse the part of a conversion specification that follows the '%' at
   `f`: an optional width, an optional ".precision" and an optional
   length modifier ("l", "ll" when HAVE_LONG_LONG is defined, or "z").
   A length modifier is only recognised when it is directly followed by
   'd', 'u' or 'i'.

   Each result is stored through its pointer unless that pointer is
   NULL.  Returns a pointer to the character at which parsing stopped
   (normally the conversion character). */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* skip the '%' */
    f++;

    /* width.precision, e.g. "%2.5s" => width=2, precision=5 */
    while (Py_ISDIGIT((unsigned)*f)) {
        width = (width*10) + *f - '0';
        f++;
    }
    if (*f == '.') {
        f++;
        while (Py_ISDIGIT((unsigned)*f)) {
            precision = (precision*10) + *f - '0';
            f++;
        }
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;

    /* Length modifiers: %ld, %lu, %li, %lld, %llu, %lli and the size_t
       forms %zd, %zu, %zi. */
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        f += 1;
    }

    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
1919
/* Maximum number of characters required for the output of %ld:
   21 characters allow for a 64-bit integer written in decimal plus an
   optional sign. */
#define MAX_LONG_CHARS 21
/* Maximum number of characters required for the output of %lld:
   at most ceil(log10(256) * SIZEOF_LONG_LONG) digits, plus 1 for the
   sign.  53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG * 53 - 1) / 22)
1927
Walter Dörwaldd2034312007-05-18 16:29:38 +00001928PyObject *
1929PyUnicode_FromFormatV(const char *format, va_list vargs)
1930{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001931 va_list count;
1932 Py_ssize_t callcount = 0;
1933 PyObject **callresults = NULL;
1934 PyObject **callresult = NULL;
1935 Py_ssize_t n = 0;
1936 int width = 0;
1937 int precision = 0;
1938 int zeropad;
1939 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001940 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001941 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001942 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001943 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1944 Py_UCS4 argmaxchar;
1945 Py_ssize_t numbersize = 0;
1946 char *numberresults = NULL;
1947 char *numberresult = NULL;
1948 Py_ssize_t i;
1949 int kind;
1950 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001951
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001952 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001953 /* step 1: count the number of %S/%R/%A/%s format specifications
1954 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1955 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001956 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02001957 * also estimate a upper bound for all the number formats in the string,
1958 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001959 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001960 for (f = format; *f; f++) {
1961 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001962 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001963 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1964 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1965 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1966 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001967
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001968 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001969#ifdef HAVE_LONG_LONG
1970 if (longlongflag) {
1971 if (width < MAX_LONG_LONG_CHARS)
1972 width = MAX_LONG_LONG_CHARS;
1973 }
1974 else
1975#endif
1976 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1977 including sign. Decimal takes the most space. This
1978 isn't enough for octal. If a width is specified we
1979 need more (which we allocate later). */
1980 if (width < MAX_LONG_CHARS)
1981 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001982
1983 /* account for the size + '\0' to separate numbers
1984 inside of the numberresults buffer */
1985 numbersize += (width + 1);
1986 }
1987 }
1988 else if ((unsigned char)*f > 127) {
1989 PyErr_Format(PyExc_ValueError,
1990 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1991 "string, got a non-ASCII byte: 0x%02x",
1992 (unsigned char)*f);
1993 return NULL;
1994 }
1995 }
1996 /* step 2: allocate memory for the results of
1997 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1998 if (callcount) {
1999 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2000 if (!callresults) {
2001 PyErr_NoMemory();
2002 return NULL;
2003 }
2004 callresult = callresults;
2005 }
2006 /* step 2.5: allocate memory for the results of formating numbers */
2007 if (numbersize) {
2008 numberresults = PyObject_Malloc(numbersize);
2009 if (!numberresults) {
2010 PyErr_NoMemory();
2011 goto fail;
2012 }
2013 numberresult = numberresults;
2014 }
2015
2016 /* step 3: format numbers and figure out how large a buffer we need */
2017 for (f = format; *f; f++) {
2018 if (*f == '%') {
2019 const char* p;
2020 int longflag;
2021 int longlongflag;
2022 int size_tflag;
2023 int numprinted;
2024
2025 p = f;
2026 zeropad = (f[1] == '0');
2027 f = parse_format_flags(f, &width, &precision,
2028 &longflag, &longlongflag, &size_tflag);
2029 switch (*f) {
2030 case 'c':
2031 {
2032 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002033 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002034 n++;
2035 break;
2036 }
2037 case '%':
2038 n++;
2039 break;
2040 case 'i':
2041 case 'd':
2042 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2043 width, precision, *f);
2044 if (longflag)
2045 numprinted = sprintf(numberresult, fmt,
2046 va_arg(count, long));
2047#ifdef HAVE_LONG_LONG
2048 else if (longlongflag)
2049 numprinted = sprintf(numberresult, fmt,
2050 va_arg(count, PY_LONG_LONG));
2051#endif
2052 else if (size_tflag)
2053 numprinted = sprintf(numberresult, fmt,
2054 va_arg(count, Py_ssize_t));
2055 else
2056 numprinted = sprintf(numberresult, fmt,
2057 va_arg(count, int));
2058 n += numprinted;
2059 /* advance by +1 to skip over the '\0' */
2060 numberresult += (numprinted + 1);
2061 assert(*(numberresult - 1) == '\0');
2062 assert(*(numberresult - 2) != '\0');
2063 assert(numprinted >= 0);
2064 assert(numberresult <= numberresults + numbersize);
2065 break;
2066 case 'u':
2067 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2068 width, precision, 'u');
2069 if (longflag)
2070 numprinted = sprintf(numberresult, fmt,
2071 va_arg(count, unsigned long));
2072#ifdef HAVE_LONG_LONG
2073 else if (longlongflag)
2074 numprinted = sprintf(numberresult, fmt,
2075 va_arg(count, unsigned PY_LONG_LONG));
2076#endif
2077 else if (size_tflag)
2078 numprinted = sprintf(numberresult, fmt,
2079 va_arg(count, size_t));
2080 else
2081 numprinted = sprintf(numberresult, fmt,
2082 va_arg(count, unsigned int));
2083 n += numprinted;
2084 numberresult += (numprinted + 1);
2085 assert(*(numberresult - 1) == '\0');
2086 assert(*(numberresult - 2) != '\0');
2087 assert(numprinted >= 0);
2088 assert(numberresult <= numberresults + numbersize);
2089 break;
2090 case 'x':
2091 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2092 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2093 n += numprinted;
2094 numberresult += (numprinted + 1);
2095 assert(*(numberresult - 1) == '\0');
2096 assert(*(numberresult - 2) != '\0');
2097 assert(numprinted >= 0);
2098 assert(numberresult <= numberresults + numbersize);
2099 break;
2100 case 'p':
2101 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2102 /* %p is ill-defined: ensure leading 0x. */
2103 if (numberresult[1] == 'X')
2104 numberresult[1] = 'x';
2105 else if (numberresult[1] != 'x') {
2106 memmove(numberresult + 2, numberresult,
2107 strlen(numberresult) + 1);
2108 numberresult[0] = '0';
2109 numberresult[1] = 'x';
2110 numprinted += 2;
2111 }
2112 n += numprinted;
2113 numberresult += (numprinted + 1);
2114 assert(*(numberresult - 1) == '\0');
2115 assert(*(numberresult - 2) != '\0');
2116 assert(numprinted >= 0);
2117 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002118 break;
2119 case 's':
2120 {
2121 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002122 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002123 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2124 if (!str)
2125 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002126 /* since PyUnicode_DecodeUTF8 returns already flexible
2127 unicode objects, there is no need to call ready on them */
2128 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002129 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002130 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002131 /* Remember the str and switch to the next slot */
2132 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002133 break;
2134 }
2135 case 'U':
2136 {
2137 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002138 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002139 if (PyUnicode_READY(obj) == -1)
2140 goto fail;
2141 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002142 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002143 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002144 break;
2145 }
2146 case 'V':
2147 {
2148 PyObject *obj = va_arg(count, PyObject *);
2149 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002150 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002151 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002152 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002153 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002154 if (PyUnicode_READY(obj) == -1)
2155 goto fail;
2156 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002157 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002158 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002159 *callresult++ = NULL;
2160 }
2161 else {
2162 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2163 if (!str_obj)
2164 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002165 if (PyUnicode_READY(str_obj)) {
2166 Py_DECREF(str_obj);
2167 goto fail;
2168 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002169 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002170 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002171 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002172 *callresult++ = str_obj;
2173 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002174 break;
2175 }
2176 case 'S':
2177 {
2178 PyObject *obj = va_arg(count, PyObject *);
2179 PyObject *str;
2180 assert(obj);
2181 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002182 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002183 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002184 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002185 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002186 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002187 /* Remember the str and switch to the next slot */
2188 *callresult++ = str;
2189 break;
2190 }
2191 case 'R':
2192 {
2193 PyObject *obj = va_arg(count, PyObject *);
2194 PyObject *repr;
2195 assert(obj);
2196 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002197 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002198 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002199 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002200 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002201 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002202 /* Remember the repr and switch to the next slot */
2203 *callresult++ = repr;
2204 break;
2205 }
2206 case 'A':
2207 {
2208 PyObject *obj = va_arg(count, PyObject *);
2209 PyObject *ascii;
2210 assert(obj);
2211 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002212 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002213 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002214 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002215 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002216 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002217 /* Remember the repr and switch to the next slot */
2218 *callresult++ = ascii;
2219 break;
2220 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002221 default:
2222 /* if we stumble upon an unknown
2223 formatting code, copy the rest of
2224 the format string to the output
2225 string. (we cannot just skip the
2226 code, since there's no way to know
2227 what's in the argument list) */
2228 n += strlen(p);
2229 goto expand;
2230 }
2231 } else
2232 n++;
2233 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002234 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002235 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002236 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002237 we don't have to resize the string.
2238 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002239 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002240 if (!string)
2241 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002242 kind = PyUnicode_KIND(string);
2243 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002244 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002245 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002246
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002247 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002248 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002249 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002250
2251 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002252 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2253 /* checking for == because the last argument could be a empty
2254 string, which causes i to point to end, the assert at the end of
2255 the loop */
2256 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002257
Benjamin Peterson14339b62009-01-31 16:36:08 +00002258 switch (*f) {
2259 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002260 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002261 const int ordinal = va_arg(vargs, int);
2262 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002263 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002264 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002265 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002266 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002267 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002268 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002269 case 'p':
2270 /* unused, since we already have the result */
2271 if (*f == 'p')
2272 (void) va_arg(vargs, void *);
2273 else
2274 (void) va_arg(vargs, int);
2275 /* extract the result from numberresults and append. */
2276 for (; *numberresult; ++i, ++numberresult)
2277 PyUnicode_WRITE(kind, data, i, *numberresult);
2278 /* skip over the separating '\0' */
2279 assert(*numberresult == '\0');
2280 numberresult++;
2281 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002282 break;
2283 case 's':
2284 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002285 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002286 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002287 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002288 size = PyUnicode_GET_LENGTH(*callresult);
2289 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002290 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2291 *callresult, 0,
2292 size) < 0)
2293 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002294 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002295 /* We're done with the unicode()/repr() => forget it */
2296 Py_DECREF(*callresult);
2297 /* switch to next unicode()/repr() result */
2298 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002299 break;
2300 }
2301 case 'U':
2302 {
2303 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002304 Py_ssize_t size;
2305 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2306 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002307 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2308 obj, 0,
2309 size) < 0)
2310 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002311 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002312 break;
2313 }
2314 case 'V':
2315 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002316 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002317 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002318 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002319 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002320 size = PyUnicode_GET_LENGTH(obj);
2321 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002322 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2323 obj, 0,
2324 size) < 0)
2325 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002326 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002327 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002328 size = PyUnicode_GET_LENGTH(*callresult);
2329 assert(PyUnicode_KIND(*callresult) <=
2330 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002331 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2332 *callresult,
2333 0, size) < 0)
2334 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002335 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002336 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002337 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002338 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002339 break;
2340 }
2341 case 'S':
2342 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002343 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002344 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002345 /* unused, since we already have the result */
2346 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002347 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002348 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2349 *callresult, 0,
2350 PyUnicode_GET_LENGTH(*callresult)) < 0)
2351 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002352 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002353 /* We're done with the unicode()/repr() => forget it */
2354 Py_DECREF(*callresult);
2355 /* switch to next unicode()/repr() result */
2356 ++callresult;
2357 break;
2358 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002359 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002360 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002361 break;
2362 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002363 for (; *p; ++p, ++i)
2364 PyUnicode_WRITE(kind, data, i, *p);
2365 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002366 goto end;
2367 }
Victor Stinner1205f272010-09-11 00:54:47 +00002368 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002369 else {
2370 assert(i < PyUnicode_GET_LENGTH(string));
2371 PyUnicode_WRITE(kind, data, i++, *f);
2372 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002373 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002374 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002375
Benjamin Peterson29060642009-01-31 22:14:21 +00002376 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002377 if (callresults)
2378 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002379 if (numberresults)
2380 PyObject_Free(numberresults);
2381 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002382 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002383 if (callresults) {
2384 PyObject **callresult2 = callresults;
2385 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002386 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002387 ++callresult2;
2388 }
2389 PyObject_Free(callresults);
2390 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002391 if (numberresults)
2392 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002393 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002394}
2395
Walter Dörwaldd2034312007-05-18 16:29:38 +00002396PyObject *
2397PyUnicode_FromFormat(const char *format, ...)
2398{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002399 PyObject* ret;
2400 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002401
2402#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002403 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002404#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002405 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002406#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002407 ret = PyUnicode_FromFormatV(format, vargs);
2408 va_end(vargs);
2409 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002410}
2411
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002412#ifdef HAVE_WCHAR_H
2413
Victor Stinner5593d8a2010-10-02 11:11:27 +00002414/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2415 convert a Unicode object to a wide character string.
2416
Victor Stinnerd88d9832011-09-06 02:00:05 +02002417 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002418 character) required to convert the unicode object. Ignore size argument.
2419
Victor Stinnerd88d9832011-09-06 02:00:05 +02002420 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002421 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002422 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002423static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002424unicode_aswidechar(PyUnicodeObject *unicode,
2425 wchar_t *w,
2426 Py_ssize_t size)
2427{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002428 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002429 const wchar_t *wstr;
2430
2431 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2432 if (wstr == NULL)
2433 return -1;
2434
Victor Stinner5593d8a2010-10-02 11:11:27 +00002435 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002436 if (size > res)
2437 size = res + 1;
2438 else
2439 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002440 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002441 return res;
2442 }
2443 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002444 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002445}
2446
2447Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002448PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002449 wchar_t *w,
2450 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002451{
2452 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002453 PyErr_BadInternalCall();
2454 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002455 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002456 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002457}
2458
Victor Stinner137c34c2010-09-29 10:25:54 +00002459wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002460PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002461 Py_ssize_t *size)
2462{
2463 wchar_t* buffer;
2464 Py_ssize_t buflen;
2465
2466 if (unicode == NULL) {
2467 PyErr_BadInternalCall();
2468 return NULL;
2469 }
2470
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002471 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002472 if (buflen == -1)
2473 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002474 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002475 PyErr_NoMemory();
2476 return NULL;
2477 }
2478
Victor Stinner137c34c2010-09-29 10:25:54 +00002479 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2480 if (buffer == NULL) {
2481 PyErr_NoMemory();
2482 return NULL;
2483 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002484 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002485 if (buflen == -1)
2486 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002487 if (size != NULL)
2488 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002489 return buffer;
2490}
2491
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002492#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002493
Alexander Belopolsky40018472011-02-26 01:02:56 +00002494PyObject *
2495PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002496{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002497 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002498 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002499 PyErr_SetString(PyExc_ValueError,
2500 "chr() arg not in range(0x110000)");
2501 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002502 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002503
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002504 if (ordinal < 256)
2505 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002506
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002507 v = PyUnicode_New(1, ordinal);
2508 if (v == NULL)
2509 return NULL;
2510 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2511 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002512}
2513
Alexander Belopolsky40018472011-02-26 01:02:56 +00002514PyObject *
2515PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002516{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002517 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002518 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002519 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002520 if (PyUnicode_READY(obj))
2521 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002522 Py_INCREF(obj);
2523 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002524 }
2525 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002526 /* For a Unicode subtype that's not a Unicode object,
2527 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002528 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002529 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002530 PyErr_Format(PyExc_TypeError,
2531 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002532 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002533 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002534}
2535
Alexander Belopolsky40018472011-02-26 01:02:56 +00002536PyObject *
2537PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002538 const char *encoding,
2539 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002540{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002541 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002542 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002543
Guido van Rossumd57fd912000-03-10 22:53:23 +00002544 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002545 PyErr_BadInternalCall();
2546 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002547 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002548
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002549 /* Decoding bytes objects is the most common case and should be fast */
2550 if (PyBytes_Check(obj)) {
2551 if (PyBytes_GET_SIZE(obj) == 0) {
2552 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002553 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002554 }
2555 else {
2556 v = PyUnicode_Decode(
2557 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2558 encoding, errors);
2559 }
2560 return v;
2561 }
2562
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002563 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002564 PyErr_SetString(PyExc_TypeError,
2565 "decoding str is not supported");
2566 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002567 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002568
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002569 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2570 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2571 PyErr_Format(PyExc_TypeError,
2572 "coercing to str: need bytes, bytearray "
2573 "or buffer-like object, %.80s found",
2574 Py_TYPE(obj)->tp_name);
2575 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002576 }
Tim Petersced69f82003-09-16 20:30:58 +00002577
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002578 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002579 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002580 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002581 }
Tim Petersced69f82003-09-16 20:30:58 +00002582 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002583 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002584
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002585 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002586 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002587}
2588
/* Write a normalized copy of 'encoding' into 'lower': upper case letters
   are lowered and every '_' becomes '-', so that e.g. "UTF_8" compares
   equal to "utf-8".

   'lower' has room for 'lower_len' chars including the terminating null.
   Return 1 on success; return 0 when 'encoding' does not fit (it is longer
   than lower_len-1), in which case 'lower' is left without a terminating
   null. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *in;
    char *out = lower;
    /* Last slot of the output buffer, reserved for the null character. */
    const char *limit = &lower[lower_len - 1];

    for (in = encoding; *in; in++) {
        char c = *in;

        if (out == limit)
            return 0;

        if (Py_ISUPPER(c))
            c = Py_TOLOWER(c);
        else if (c == '_')
            c = '-';
        *out++ = c;
    }
    *out = '\0';
    return 1;
}
2621
Alexander Belopolsky40018472011-02-26 01:02:56 +00002622PyObject *
2623PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002624 Py_ssize_t size,
2625 const char *encoding,
2626 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002627{
2628 PyObject *buffer = NULL, *unicode;
2629 Py_buffer info;
2630 char lower[11]; /* Enough for any encoding shortcut */
2631
2632 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002633 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002634
2635 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002636 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002637 if ((strcmp(lower, "utf-8") == 0) ||
2638 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002639 return PyUnicode_DecodeUTF8(s, size, errors);
2640 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002641 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002642 (strcmp(lower, "iso-8859-1") == 0))
2643 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002644#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002645 else if (strcmp(lower, "mbcs") == 0)
2646 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002647#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002648 else if (strcmp(lower, "ascii") == 0)
2649 return PyUnicode_DecodeASCII(s, size, errors);
2650 else if (strcmp(lower, "utf-16") == 0)
2651 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2652 else if (strcmp(lower, "utf-32") == 0)
2653 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2654 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002655
2656 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002657 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002658 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002659 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002660 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002661 if (buffer == NULL)
2662 goto onError;
2663 unicode = PyCodec_Decode(buffer, encoding, errors);
2664 if (unicode == NULL)
2665 goto onError;
2666 if (!PyUnicode_Check(unicode)) {
2667 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002668 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002669 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002670 Py_DECREF(unicode);
2671 goto onError;
2672 }
2673 Py_DECREF(buffer);
Victor Stinner17efeed2011-10-04 20:05:46 +02002674#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02002675 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002676 Py_DECREF(unicode);
2677 return NULL;
2678 }
Victor Stinner17efeed2011-10-04 20:05:46 +02002679#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002680 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002681
Benjamin Peterson29060642009-01-31 22:14:21 +00002682 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002683 Py_XDECREF(buffer);
2684 return NULL;
2685}
2686
Alexander Belopolsky40018472011-02-26 01:02:56 +00002687PyObject *
2688PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002689 const char *encoding,
2690 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002691{
2692 PyObject *v;
2693
2694 if (!PyUnicode_Check(unicode)) {
2695 PyErr_BadArgument();
2696 goto onError;
2697 }
2698
2699 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002700 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002701
2702 /* Decode via the codec registry */
2703 v = PyCodec_Decode(unicode, encoding, errors);
2704 if (v == NULL)
2705 goto onError;
2706 return v;
2707
Benjamin Peterson29060642009-01-31 22:14:21 +00002708 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002709 return NULL;
2710}
2711
Alexander Belopolsky40018472011-02-26 01:02:56 +00002712PyObject *
2713PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002714 const char *encoding,
2715 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002716{
2717 PyObject *v;
2718
2719 if (!PyUnicode_Check(unicode)) {
2720 PyErr_BadArgument();
2721 goto onError;
2722 }
2723
2724 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002725 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002726
2727 /* Decode via the codec registry */
2728 v = PyCodec_Decode(unicode, encoding, errors);
2729 if (v == NULL)
2730 goto onError;
2731 if (!PyUnicode_Check(v)) {
2732 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002733 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002734 Py_TYPE(v)->tp_name);
2735 Py_DECREF(v);
2736 goto onError;
2737 }
2738 return v;
2739
Benjamin Peterson29060642009-01-31 22:14:21 +00002740 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002741 return NULL;
2742}
2743
Alexander Belopolsky40018472011-02-26 01:02:56 +00002744PyObject *
2745PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002746 Py_ssize_t size,
2747 const char *encoding,
2748 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002749{
2750 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002751
Guido van Rossumd57fd912000-03-10 22:53:23 +00002752 unicode = PyUnicode_FromUnicode(s, size);
2753 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002754 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002755 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2756 Py_DECREF(unicode);
2757 return v;
2758}
2759
Alexander Belopolsky40018472011-02-26 01:02:56 +00002760PyObject *
2761PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002762 const char *encoding,
2763 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002764{
2765 PyObject *v;
2766
2767 if (!PyUnicode_Check(unicode)) {
2768 PyErr_BadArgument();
2769 goto onError;
2770 }
2771
2772 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002773 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002774
2775 /* Encode via the codec registry */
2776 v = PyCodec_Encode(unicode, encoding, errors);
2777 if (v == NULL)
2778 goto onError;
2779 return v;
2780
Benjamin Peterson29060642009-01-31 22:14:21 +00002781 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002782 return NULL;
2783}
2784
Victor Stinnerad158722010-10-27 00:25:46 +00002785PyObject *
2786PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002787{
Victor Stinner99b95382011-07-04 14:23:54 +02002788#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002789 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2790 PyUnicode_GET_SIZE(unicode),
2791 NULL);
2792#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002793 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002794#else
Victor Stinner793b5312011-04-27 00:24:21 +02002795 PyInterpreterState *interp = PyThreadState_GET()->interp;
2796 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2797 cannot use it to encode and decode filenames before it is loaded. Load
2798 the Python codec requires to encode at least its own filename. Use the C
2799 version of the locale codec until the codec registry is initialized and
2800 the Python codec is loaded.
2801
2802 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2803 cannot only rely on it: check also interp->fscodec_initialized for
2804 subinterpreters. */
2805 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002806 return PyUnicode_AsEncodedString(unicode,
2807 Py_FileSystemDefaultEncoding,
2808 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002809 }
2810 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002811 /* locale encoding with surrogateescape */
2812 wchar_t *wchar;
2813 char *bytes;
2814 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002815 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002816
2817 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2818 if (wchar == NULL)
2819 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002820 bytes = _Py_wchar2char(wchar, &error_pos);
2821 if (bytes == NULL) {
2822 if (error_pos != (size_t)-1) {
2823 char *errmsg = strerror(errno);
2824 PyObject *exc = NULL;
2825 if (errmsg == NULL)
2826 errmsg = "Py_wchar2char() failed";
2827 raise_encode_exception(&exc,
2828 "filesystemencoding",
2829 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2830 error_pos, error_pos+1,
2831 errmsg);
2832 Py_XDECREF(exc);
2833 }
2834 else
2835 PyErr_NoMemory();
2836 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002837 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002838 }
2839 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002840
2841 bytes_obj = PyBytes_FromString(bytes);
2842 PyMem_Free(bytes);
2843 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002844 }
Victor Stinnerad158722010-10-27 00:25:46 +00002845#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002846}
2847
Alexander Belopolsky40018472011-02-26 01:02:56 +00002848PyObject *
2849PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002850 const char *encoding,
2851 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002852{
2853 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002854 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002855
Guido van Rossumd57fd912000-03-10 22:53:23 +00002856 if (!PyUnicode_Check(unicode)) {
2857 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002858 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002859 }
Fred Drakee4315f52000-05-09 19:53:39 +00002860
Victor Stinner2f283c22011-03-02 01:21:46 +00002861 if (encoding == NULL) {
2862 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002863 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002864 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002865 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002866 }
Fred Drakee4315f52000-05-09 19:53:39 +00002867
2868 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002869 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002870 if ((strcmp(lower, "utf-8") == 0) ||
2871 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002872 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002873 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002874 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002875 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002876 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002877 }
Victor Stinner37296e82010-06-10 13:36:23 +00002878 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002879 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002880 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002881 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002882#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002883 else if (strcmp(lower, "mbcs") == 0)
2884 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2885 PyUnicode_GET_SIZE(unicode),
2886 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002887#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002888 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002889 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002890 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002891
2892 /* Encode via the codec registry */
2893 v = PyCodec_Encode(unicode, encoding, errors);
2894 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002895 return NULL;
2896
2897 /* The normal path */
2898 if (PyBytes_Check(v))
2899 return v;
2900
2901 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002902 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002903 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002904 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002905
2906 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2907 "encoder %s returned bytearray instead of bytes",
2908 encoding);
2909 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002910 Py_DECREF(v);
2911 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002912 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002913
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002914 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2915 Py_DECREF(v);
2916 return b;
2917 }
2918
2919 PyErr_Format(PyExc_TypeError,
2920 "encoder did not return a bytes object (type=%.400s)",
2921 Py_TYPE(v)->tp_name);
2922 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002923 return NULL;
2924}
2925
Alexander Belopolsky40018472011-02-26 01:02:56 +00002926PyObject *
2927PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002928 const char *encoding,
2929 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002930{
2931 PyObject *v;
2932
2933 if (!PyUnicode_Check(unicode)) {
2934 PyErr_BadArgument();
2935 goto onError;
2936 }
2937
2938 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002939 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002940
2941 /* Encode via the codec registry */
2942 v = PyCodec_Encode(unicode, encoding, errors);
2943 if (v == NULL)
2944 goto onError;
2945 if (!PyUnicode_Check(v)) {
2946 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002947 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002948 Py_TYPE(v)->tp_name);
2949 Py_DECREF(v);
2950 goto onError;
2951 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002952 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002953
Benjamin Peterson29060642009-01-31 22:14:21 +00002954 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002955 return NULL;
2956}
2957
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002958PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002959PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002960 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002961 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2962}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002963
Christian Heimes5894ba72007-11-04 11:43:14 +00002964PyObject*
2965PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2966{
Victor Stinner99b95382011-07-04 14:23:54 +02002967#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002968 return PyUnicode_DecodeMBCS(s, size, NULL);
2969#elif defined(__APPLE__)
2970 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2971#else
Victor Stinner793b5312011-04-27 00:24:21 +02002972 PyInterpreterState *interp = PyThreadState_GET()->interp;
2973 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2974 cannot use it to encode and decode filenames before it is loaded. Load
2975 the Python codec requires to encode at least its own filename. Use the C
2976 version of the locale codec until the codec registry is initialized and
2977 the Python codec is loaded.
2978
2979 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2980 cannot only rely on it: check also interp->fscodec_initialized for
2981 subinterpreters. */
2982 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002983 return PyUnicode_Decode(s, size,
2984 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002985 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002986 }
2987 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002988 /* locale encoding with surrogateescape */
2989 wchar_t *wchar;
2990 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002991 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002992
2993 if (s[size] != '\0' || size != strlen(s)) {
2994 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2995 return NULL;
2996 }
2997
Victor Stinner168e1172010-10-16 23:16:16 +00002998 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002999 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003000 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003001
Victor Stinner168e1172010-10-16 23:16:16 +00003002 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003003 PyMem_Free(wchar);
3004 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003005 }
Victor Stinnerad158722010-10-27 00:25:46 +00003006#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003007}
3008
Martin v. Löwis011e8422009-05-05 04:43:17 +00003009
3010int
3011PyUnicode_FSConverter(PyObject* arg, void* addr)
3012{
3013 PyObject *output = NULL;
3014 Py_ssize_t size;
3015 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003016 if (arg == NULL) {
3017 Py_DECREF(*(PyObject**)addr);
3018 return 1;
3019 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003020 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003021 output = arg;
3022 Py_INCREF(output);
3023 }
3024 else {
3025 arg = PyUnicode_FromObject(arg);
3026 if (!arg)
3027 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003028 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003029 Py_DECREF(arg);
3030 if (!output)
3031 return 0;
3032 if (!PyBytes_Check(output)) {
3033 Py_DECREF(output);
3034 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3035 return 0;
3036 }
3037 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003038 size = PyBytes_GET_SIZE(output);
3039 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003040 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003041 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003042 Py_DECREF(output);
3043 return 0;
3044 }
3045 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003046 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003047}
3048
3049
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003050int
3051PyUnicode_FSDecoder(PyObject* arg, void* addr)
3052{
3053 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003054 if (arg == NULL) {
3055 Py_DECREF(*(PyObject**)addr);
3056 return 1;
3057 }
3058 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003059 if (PyUnicode_READY(arg))
3060 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003061 output = arg;
3062 Py_INCREF(output);
3063 }
3064 else {
3065 arg = PyBytes_FromObject(arg);
3066 if (!arg)
3067 return 0;
3068 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3069 PyBytes_GET_SIZE(arg));
3070 Py_DECREF(arg);
3071 if (!output)
3072 return 0;
3073 if (!PyUnicode_Check(output)) {
3074 Py_DECREF(output);
3075 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3076 return 0;
3077 }
3078 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003079 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3080 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003081 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3082 Py_DECREF(output);
3083 return 0;
3084 }
3085 *(PyObject**)addr = output;
3086 return Py_CLEANUP_SUPPORTED;
3087}
3088
3089
Martin v. Löwis5b222132007-06-10 09:51:05 +00003090char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003091PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003092{
Christian Heimesf3863112007-11-22 07:46:41 +00003093 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003094 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
3095
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003096 if (!PyUnicode_Check(unicode)) {
3097 PyErr_BadArgument();
3098 return NULL;
3099 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003100 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003101 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003102
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003103 if (PyUnicode_UTF8(unicode) == NULL) {
3104 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003105 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3106 if (bytes == NULL)
3107 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003108 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3109 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003110 Py_DECREF(bytes);
3111 return NULL;
3112 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003113 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
3114 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003115 Py_DECREF(bytes);
3116 }
3117
3118 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003119 *psize = PyUnicode_UTF8_LENGTH(unicode);
3120 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003121}
3122
3123char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003124PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003125{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003126 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3127}
3128
3129#ifdef Py_DEBUG
3130int unicode_as_unicode_calls = 0;
3131#endif
3132
3133
3134Py_UNICODE *
3135PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3136{
3137 PyUnicodeObject *u;
3138 const unsigned char *one_byte;
3139#if SIZEOF_WCHAR_T == 4
3140 const Py_UCS2 *two_bytes;
3141#else
3142 const Py_UCS4 *four_bytes;
3143 const Py_UCS4 *ucs4_end;
3144 Py_ssize_t num_surrogates;
3145#endif
3146 wchar_t *w;
3147 wchar_t *wchar_end;
3148
3149 if (!PyUnicode_Check(unicode)) {
3150 PyErr_BadArgument();
3151 return NULL;
3152 }
3153 u = (PyUnicodeObject*)unicode;
3154 if (_PyUnicode_WSTR(u) == NULL) {
3155 /* Non-ASCII compact unicode object */
3156 assert(_PyUnicode_KIND(u) != 0);
3157 assert(PyUnicode_IS_READY(u));
3158
3159#ifdef Py_DEBUG
3160 ++unicode_as_unicode_calls;
3161#endif
3162
3163 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
3164#if SIZEOF_WCHAR_T == 2
3165 four_bytes = PyUnicode_4BYTE_DATA(u);
3166 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
3167 num_surrogates = 0;
3168
3169 for (; four_bytes < ucs4_end; ++four_bytes) {
3170 if (*four_bytes > 0xFFFF)
3171 ++num_surrogates;
3172 }
3173
3174 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3175 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3176 if (!_PyUnicode_WSTR(u)) {
3177 PyErr_NoMemory();
3178 return NULL;
3179 }
3180 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3181
3182 w = _PyUnicode_WSTR(u);
3183 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3184 four_bytes = PyUnicode_4BYTE_DATA(u);
3185 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3186 if (*four_bytes > 0xFFFF) {
3187 /* encode surrogate pair in this case */
3188 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3189 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3190 }
3191 else
3192 *w = *four_bytes;
3193
3194 if (w > wchar_end) {
3195 assert(0 && "Miscalculated string end");
3196 }
3197 }
3198 *w = 0;
3199#else
3200 /* sizeof(wchar_t) == 4 */
3201 Py_FatalError("Impossible unicode object state, wstr and str "
3202 "should share memory already.");
3203 return NULL;
3204#endif
3205 }
3206 else {
3207 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3208 (_PyUnicode_LENGTH(u) + 1));
3209 if (!_PyUnicode_WSTR(u)) {
3210 PyErr_NoMemory();
3211 return NULL;
3212 }
3213 if (!PyUnicode_IS_COMPACT_ASCII(u))
3214 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3215 w = _PyUnicode_WSTR(u);
3216 wchar_end = w + _PyUnicode_LENGTH(u);
3217
3218 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3219 one_byte = PyUnicode_1BYTE_DATA(u);
3220 for (; w < wchar_end; ++one_byte, ++w)
3221 *w = *one_byte;
3222 /* null-terminate the wstr */
3223 *w = 0;
3224 }
3225 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3226#if SIZEOF_WCHAR_T == 4
3227 two_bytes = PyUnicode_2BYTE_DATA(u);
3228 for (; w < wchar_end; ++two_bytes, ++w)
3229 *w = *two_bytes;
3230 /* null-terminate the wstr */
3231 *w = 0;
3232#else
3233 /* sizeof(wchar_t) == 2 */
3234 PyObject_FREE(_PyUnicode_WSTR(u));
3235 _PyUnicode_WSTR(u) = NULL;
3236 Py_FatalError("Impossible unicode object state, wstr "
3237 "and str should share memory already.");
3238 return NULL;
3239#endif
3240 }
3241 else {
3242 assert(0 && "This should never happen.");
3243 }
3244 }
3245 }
3246 if (size != NULL)
3247 *size = PyUnicode_WSTR_LENGTH(u);
3248 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003249}
3250
Alexander Belopolsky40018472011-02-26 01:02:56 +00003251Py_UNICODE *
3252PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003253{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003254 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003255}
3256
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003257
Alexander Belopolsky40018472011-02-26 01:02:56 +00003258Py_ssize_t
3259PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003260{
3261 if (!PyUnicode_Check(unicode)) {
3262 PyErr_BadArgument();
3263 goto onError;
3264 }
3265 return PyUnicode_GET_SIZE(unicode);
3266
Benjamin Peterson29060642009-01-31 22:14:21 +00003267 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003268 return -1;
3269}
3270
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003271Py_ssize_t
3272PyUnicode_GetLength(PyObject *unicode)
3273{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003274 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003275 PyErr_BadArgument();
3276 return -1;
3277 }
3278
3279 return PyUnicode_GET_LENGTH(unicode);
3280}
3281
3282Py_UCS4
3283PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3284{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003285 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3286 PyErr_BadArgument();
3287 return (Py_UCS4)-1;
3288 }
3289 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3290 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003291 return (Py_UCS4)-1;
3292 }
3293 return PyUnicode_READ_CHAR(unicode, index);
3294}
3295
3296int
3297PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3298{
3299 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003300 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003301 return -1;
3302 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003303 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3304 PyErr_SetString(PyExc_IndexError, "string index out of range");
3305 return -1;
3306 }
3307 if (_PyUnicode_Dirty(unicode))
3308 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003309 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3310 index, ch);
3311 return 0;
3312}
3313
/* Return the name of the default encoding; it is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    const char *name = "utf-8";

    return name;
}
3319
Victor Stinner554f3f02010-06-16 23:33:54 +00003320/* create or adjust a UnicodeDecodeError */
3321static void
3322make_decode_exception(PyObject **exceptionObject,
3323 const char *encoding,
3324 const char *input, Py_ssize_t length,
3325 Py_ssize_t startpos, Py_ssize_t endpos,
3326 const char *reason)
3327{
3328 if (*exceptionObject == NULL) {
3329 *exceptionObject = PyUnicodeDecodeError_Create(
3330 encoding, input, length, startpos, endpos, reason);
3331 }
3332 else {
3333 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3334 goto onError;
3335 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3336 goto onError;
3337 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3338 goto onError;
3339 }
3340 return;
3341
3342onError:
3343 Py_DECREF(*exceptionObject);
3344 *exceptionObject = NULL;
3345}
3346
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003347/* error handling callback helper:
3348 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003349 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003350 and adjust various state variables.
3351 return 0 on success, -1 on error
3352*/
3353
Alexander Belopolsky40018472011-02-26 01:02:56 +00003354static int
3355unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003356 const char *encoding, const char *reason,
3357 const char **input, const char **inend, Py_ssize_t *startinpos,
3358 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3359 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003360{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003361 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003362
3363 PyObject *restuple = NULL;
3364 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003365 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003366 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003367 Py_ssize_t requiredsize;
3368 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003369 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003370 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003371 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003372 int res = -1;
3373
3374 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003375 *errorHandler = PyCodec_LookupError(errors);
3376 if (*errorHandler == NULL)
3377 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003378 }
3379
Victor Stinner554f3f02010-06-16 23:33:54 +00003380 make_decode_exception(exceptionObject,
3381 encoding,
3382 *input, *inend - *input,
3383 *startinpos, *endinpos,
3384 reason);
3385 if (*exceptionObject == NULL)
3386 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003387
3388 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3389 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003390 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003391 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003392 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003393 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003394 }
3395 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003396 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003397
3398 /* Copy back the bytes variables, which might have been modified by the
3399 callback */
3400 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3401 if (!inputobj)
3402 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003403 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003404 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003405 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003406 *input = PyBytes_AS_STRING(inputobj);
3407 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003408 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003409 /* we can DECREF safely, as the exception has another reference,
3410 so the object won't go away. */
3411 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003412
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003413 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003414 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003415 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003416 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3417 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003418 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003419
3420 /* need more space? (at least enough for what we
3421 have+the replacement+the rest of the string (starting
3422 at the new input position), so we won't have to check space
3423 when there are no errors in the rest of the string) */
3424 repptr = PyUnicode_AS_UNICODE(repunicode);
3425 repsize = PyUnicode_GET_SIZE(repunicode);
3426 requiredsize = *outpos + repsize + insize-newpos;
3427 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003428 if (requiredsize<2*outsize)
3429 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003430 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003431 goto onError;
3432 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003433 }
3434 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003435 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003436 Py_UNICODE_COPY(*outptr, repptr, repsize);
3437 *outptr += repsize;
3438 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003439
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003440 /* we made it! */
3441 res = 0;
3442
Benjamin Peterson29060642009-01-31 22:14:21 +00003443 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003444 Py_XDECREF(restuple);
3445 return res;
3446}
3447
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003448/* --- UTF-7 Codec -------------------------------------------------------- */
3449
Antoine Pitrou244651a2009-05-04 18:56:13 +00003450/* See RFC2152 for details. We encode conservatively and decode liberally. */
3451
/* Three helper macros for the base-64 alphabet (A-Z, a-z, 0-9, '+', '/').
   Each one may evaluate its argument several times, so pass only
   expressions without side effects. */

/* Non-zero when c belongs to the base-64 alphabet. */

#define IS_BASE64(c)                            \
    ((c) == '+' || (c) == '/' ||                \
     ((c) >= '0' && (c) <= '9') ||              \
     ((c) >= 'a' && (c) <= 'z') ||              \
     ((c) >= 'A' && (c) <= 'Z'))

/* Map a base-64 character to its 6-bit value (0..63).  The result is only
   meaningful when IS_BASE64(c) holds; any other input yields 63. */

#define FROM_BASE64(c)                                  \
    ((c) == '+' ? 62 :                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     63)

/* Map the low 6 bits of n to the corresponding base-64 character. */

#define TO_BASE64(n)                            \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"               \
     "abcdefghijklmnopqrstuvwxyz"               \
     "0123456789+/"[(n) & 0x3f])
3474
/* DECODE_DIRECT: non-zero when a byte found in a UTF-7 string outside a
 * base-64 section decodes as itself.  Decoding is permissive: every ASCII
 * byte qualifies except '+', which opens a base-64 section.  The argument
 * may be evaluated twice. */

#define DECODE_DIRECT(c)                        \
    (!((c) > 127 || (c) == '+'))
3482
/* Classification of the 128 ASCII characters for the UTF-7 encoder
 * (see RFC2152).  Each entry is one of:
 *   0 : "Set D"
 *       alphanumeric and '(),-./:?
 *   1 : "Set O"
 *       !"#$%&*;<=>@[]^_`{|}
 *   2 : "whitespace"
 *       ht nl cr sp
 *   3 : special (must be base64 encoded)
 *       everything else (i.e. + backslash ~ and the non-printing codes
 *       0-8 11-12 14-31 127)
 * The table is indexed by character code; one row covers 16 codes.
 */

static char utf7_category[128] = {
    /* 0x00-0x0f: nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    /* 0x10-0x1f: dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x20-0x2f: sp ! " # $ % & ' ( ) * + , - . / */
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0,
    /* 0x30-0x3f: 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    /* 0x40-0x4f: @ A B C D E F G H I J K L M N O */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5f: P Q R S T U V W X Y Z [ backslash ] ^ _ */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    /* 0x60-0x6f: ` a b c d e f g h i j k l m n o */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7f: p q r s t u v w x y z { | } ~ del */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3,
};
3516
/* ENCODE_DIRECT: non-zero when character c should be encoded as itself.
 * The answer depends on whether we are encoding set O as itself (directO),
 * and also on whether we are encoding whitespace as itself (directWS).
 * RFC2152 makes it clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * Only characters in the range 1..127 can qualify; they are looked up in
 * utf7_category.  Set D characters (category 0) are always direct.
 * c is evaluated several times; directO and directWS are parenthesized so
 * that compound expressions such as `a || b` group as written. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003528
Alexander Belopolsky40018472011-02-26 01:02:56 +00003529PyObject *
3530PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003531 Py_ssize_t size,
3532 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003533{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003534 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3535}
3536
Antoine Pitrou244651a2009-05-04 18:56:13 +00003537/* The decoder. The only state we preserve is our read position,
3538 * i.e. how many characters we have consumed. So if we end in the
3539 * middle of a shift sequence we have to back off the read position
3540 * and the output to the beginning of the sequence, otherwise we lose
3541 * all the shift state (seen bits, number of bits seen, high
3542 * surrogate). */
3543
Alexander Belopolsky40018472011-02-26 01:02:56 +00003544PyObject *
3545PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003546 Py_ssize_t size,
3547 const char *errors,
3548 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003549{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003550 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003551 Py_ssize_t startinpos;
3552 Py_ssize_t endinpos;
3553 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003554 const char *e;
3555 PyUnicodeObject *unicode;
3556 Py_UNICODE *p;
3557 const char *errmsg = "";
3558 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003559 Py_UNICODE *shiftOutStart;
3560 unsigned int base64bits = 0;
3561 unsigned long base64buffer = 0;
3562 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003563 PyObject *errorHandler = NULL;
3564 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003565
3566 unicode = _PyUnicode_New(size);
3567 if (!unicode)
3568 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003569 if (size == 0) {
3570 if (consumed)
3571 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003572 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003573 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003574
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003575 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003576 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003577 e = s + size;
3578
3579 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003580 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003581 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003582 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003583
Antoine Pitrou244651a2009-05-04 18:56:13 +00003584 if (inShift) { /* in a base-64 section */
3585 if (IS_BASE64(ch)) { /* consume a base-64 character */
3586 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3587 base64bits += 6;
3588 s++;
3589 if (base64bits >= 16) {
3590 /* we have enough bits for a UTF-16 value */
3591 Py_UNICODE outCh = (Py_UNICODE)
3592 (base64buffer >> (base64bits-16));
3593 base64bits -= 16;
3594 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3595 if (surrogate) {
3596 /* expecting a second surrogate */
3597 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3598#ifdef Py_UNICODE_WIDE
3599 *p++ = (((surrogate & 0x3FF)<<10)
3600 | (outCh & 0x3FF)) + 0x10000;
3601#else
3602 *p++ = surrogate;
3603 *p++ = outCh;
3604#endif
3605 surrogate = 0;
3606 }
3607 else {
3608 surrogate = 0;
3609 errmsg = "second surrogate missing";
3610 goto utf7Error;
3611 }
3612 }
3613 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3614 /* first surrogate */
3615 surrogate = outCh;
3616 }
3617 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3618 errmsg = "unexpected second surrogate";
3619 goto utf7Error;
3620 }
3621 else {
3622 *p++ = outCh;
3623 }
3624 }
3625 }
3626 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003627 inShift = 0;
3628 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003629 if (surrogate) {
3630 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003631 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003632 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003633 if (base64bits > 0) { /* left-over bits */
3634 if (base64bits >= 6) {
3635 /* We've seen at least one base-64 character */
3636 errmsg = "partial character in shift sequence";
3637 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003638 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003639 else {
3640 /* Some bits remain; they should be zero */
3641 if (base64buffer != 0) {
3642 errmsg = "non-zero padding bits in shift sequence";
3643 goto utf7Error;
3644 }
3645 }
3646 }
3647 if (ch != '-') {
3648 /* '-' is absorbed; other terminating
3649 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003650 *p++ = ch;
3651 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003652 }
3653 }
3654 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003655 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003656 s++; /* consume '+' */
3657 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003658 s++;
3659 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003660 }
3661 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003662 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003663 shiftOutStart = p;
3664 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003665 }
3666 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003667 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003668 *p++ = ch;
3669 s++;
3670 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003671 else {
3672 startinpos = s-starts;
3673 s++;
3674 errmsg = "unexpected special character";
3675 goto utf7Error;
3676 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003677 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003678utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003679 outpos = p-PyUnicode_AS_UNICODE(unicode);
3680 endinpos = s-starts;
3681 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003682 errors, &errorHandler,
3683 "utf7", errmsg,
3684 &starts, &e, &startinpos, &endinpos, &exc, &s,
3685 &unicode, &outpos, &p))
3686 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003687 }
3688
Antoine Pitrou244651a2009-05-04 18:56:13 +00003689 /* end of string */
3690
3691 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3692 /* if we're in an inconsistent state, that's an error */
3693 if (surrogate ||
3694 (base64bits >= 6) ||
3695 (base64bits > 0 && base64buffer != 0)) {
3696 outpos = p-PyUnicode_AS_UNICODE(unicode);
3697 endinpos = size;
3698 if (unicode_decode_call_errorhandler(
3699 errors, &errorHandler,
3700 "utf7", "unterminated shift sequence",
3701 &starts, &e, &startinpos, &endinpos, &exc, &s,
3702 &unicode, &outpos, &p))
3703 goto onError;
3704 if (s < e)
3705 goto restart;
3706 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003707 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003708
3709 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003710 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003711 if (inShift) {
3712 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003713 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003714 }
3715 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003716 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003717 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003718 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003719
Victor Stinnerfe226c02011-10-03 03:52:20 +02003720 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003721 goto onError;
3722
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003723 Py_XDECREF(errorHandler);
3724 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02003725#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02003726 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003727 Py_DECREF(unicode);
3728 return NULL;
3729 }
Victor Stinner17efeed2011-10-04 20:05:46 +02003730#endif
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003731 return (PyObject *)unicode;
3732
Benjamin Peterson29060642009-01-31 22:14:21 +00003733 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003734 Py_XDECREF(errorHandler);
3735 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003736 Py_DECREF(unicode);
3737 return NULL;
3738}
3739
3740
Alexander Belopolsky40018472011-02-26 01:02:56 +00003741PyObject *
3742PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003743 Py_ssize_t size,
3744 int base64SetO,
3745 int base64WhiteSpace,
3746 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003747{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003748 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003749 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003750 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003751 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003752 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003753 unsigned int base64bits = 0;
3754 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003755 char * out;
3756 char * start;
3757
3758 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003759 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003760
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003761 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003762 return PyErr_NoMemory();
3763
Antoine Pitrou244651a2009-05-04 18:56:13 +00003764 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003765 if (v == NULL)
3766 return NULL;
3767
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003768 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003769 for (;i < size; ++i) {
3770 Py_UNICODE ch = s[i];
3771
Antoine Pitrou244651a2009-05-04 18:56:13 +00003772 if (inShift) {
3773 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3774 /* shifting out */
3775 if (base64bits) { /* output remaining bits */
3776 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3777 base64buffer = 0;
3778 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003779 }
3780 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003781 /* Characters not in the BASE64 set implicitly unshift the sequence
3782 so no '-' is required, except if the character is itself a '-' */
3783 if (IS_BASE64(ch) || ch == '-') {
3784 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003785 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003786 *out++ = (char) ch;
3787 }
3788 else {
3789 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003790 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003791 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003792 else { /* not in a shift sequence */
3793 if (ch == '+') {
3794 *out++ = '+';
3795 *out++ = '-';
3796 }
3797 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3798 *out++ = (char) ch;
3799 }
3800 else {
3801 *out++ = '+';
3802 inShift = 1;
3803 goto encode_char;
3804 }
3805 }
3806 continue;
3807encode_char:
3808#ifdef Py_UNICODE_WIDE
3809 if (ch >= 0x10000) {
3810 /* code first surrogate */
3811 base64bits += 16;
3812 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3813 while (base64bits >= 6) {
3814 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3815 base64bits -= 6;
3816 }
3817 /* prepare second surrogate */
3818 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3819 }
3820#endif
3821 base64bits += 16;
3822 base64buffer = (base64buffer << 16) | ch;
3823 while (base64bits >= 6) {
3824 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3825 base64bits -= 6;
3826 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003827 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003828 if (base64bits)
3829 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3830 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003831 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003832 if (_PyBytes_Resize(&v, out - start) < 0)
3833 return NULL;
3834 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003835}
3836
Antoine Pitrou244651a2009-05-04 18:56:13 +00003837#undef IS_BASE64
3838#undef FROM_BASE64
3839#undef TO_BASE64
3840#undef DECODE_DIRECT
3841#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003842
Guido van Rossumd57fd912000-03-10 22:53:23 +00003843/* --- UTF-8 Codec -------------------------------------------------------- */
3844
/* Lookup table indexed by the first byte of a UTF-8 sequence; the entry is
   the total number of bytes in that sequence.  An entry of zero means the
   byte is not a legal lead byte (continuation bytes 80-BF, the overlong
   leads C0-C1, and F5-FF).  See RFC 3629 for details. */
static char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
3866
Alexander Belopolsky40018472011-02-26 01:02:56 +00003867PyObject *
3868PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003869 Py_ssize_t size,
3870 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003871{
Walter Dörwald69652032004-09-07 20:24:22 +00003872 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3873}
3874
Antoine Pitrouab868312009-01-10 15:40:25 +00003875/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3876#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3877
3878/* Mask to quickly check whether a C 'long' contains a
3879 non-ASCII, UTF8-encoded char. */
3880#if (SIZEOF_LONG == 8)
3881# define ASCII_CHAR_MASK 0x8080808080808080L
3882#elif (SIZEOF_LONG == 4)
3883# define ASCII_CHAR_MASK 0x80808080L
3884#else
3885# error C 'long' size should be either 4 or 8!
3886#endif
3887
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003888/* Scans a UTF-8 string and returns the maximum character to be expected,
3889 the size of the decoded unicode string and if any major errors were
3890 encountered.
3891
3892 This function does check basic UTF-8 sanity, it does however NOT CHECK
3893 if the string contains surrogates, and if all continuation bytes are
3894 within the correct ranges, these checks are performed in
3895 PyUnicode_DecodeUTF8Stateful.
3896
3897 If it sets has_errors to 1, it means the value of unicode_size and max_char
3898 will be bogus and you should not rely on useful information in them.
3899 */
3900static Py_UCS4
3901utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3902 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3903 int *has_errors)
3904{
3905 Py_ssize_t n;
3906 Py_ssize_t char_count = 0;
3907 Py_UCS4 max_char = 127, new_max;
3908 Py_UCS4 upper_bound;
3909 const unsigned char *p = (const unsigned char *)s;
3910 const unsigned char *end = p + string_size;
3911 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3912 int err = 0;
3913
3914 for (; p < end && !err; ++p, ++char_count) {
3915 /* Only check value if it's not a ASCII char... */
3916 if (*p < 0x80) {
3917 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3918 an explanation. */
3919 if (!((size_t) p & LONG_PTR_MASK)) {
3920 /* Help register allocation */
3921 register const unsigned char *_p = p;
3922 while (_p < aligned_end) {
3923 unsigned long value = *(unsigned long *) _p;
3924 if (value & ASCII_CHAR_MASK)
3925 break;
3926 _p += SIZEOF_LONG;
3927 char_count += SIZEOF_LONG;
3928 }
3929 p = _p;
3930 if (p == end)
3931 break;
3932 }
3933 }
3934 if (*p >= 0x80) {
3935 n = utf8_code_length[*p];
3936 new_max = max_char;
3937 switch (n) {
3938 /* invalid start byte */
3939 case 0:
3940 err = 1;
3941 break;
3942 case 2:
3943 /* Code points between 0x00FF and 0x07FF inclusive.
3944 Approximate the upper bound of the code point,
3945 if this flips over 255 we can be sure it will be more
3946 than 255 and the string will need 2 bytes per code coint,
3947 if it stays under or equal to 255, we can be sure 1 byte
3948 is enough.
3949 ((*p & 0b00011111) << 6) | 0b00111111 */
3950 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3951 if (max_char < upper_bound)
3952 new_max = upper_bound;
3953 /* Ensure we track at least that we left ASCII space. */
3954 if (new_max < 128)
3955 new_max = 128;
3956 break;
3957 case 3:
3958 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3959 always > 255 and <= 65535 and will always need 2 bytes. */
3960 if (max_char < 65535)
3961 new_max = 65535;
3962 break;
3963 case 4:
3964 /* Code point will be above 0xFFFF for sure in this case. */
3965 new_max = 65537;
3966 break;
3967 /* Internal error, this should be caught by the first if */
3968 case 1:
3969 default:
3970 assert(0 && "Impossible case in utf8_max_char_and_size");
3971 err = 1;
3972 }
3973 /* Instead of number of overall bytes for this code point,
Georg Brandl7597add2011-10-05 16:36:47 +02003974 n contains the number of following bytes: */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003975 --n;
3976 /* Check if the follow up chars are all valid continuation bytes */
3977 if (n >= 1) {
3978 const unsigned char *cont;
3979 if ((p + n) >= end) {
3980 if (consumed == 0)
3981 /* incomplete data, non-incremental decoding */
3982 err = 1;
3983 break;
3984 }
3985 for (cont = p + 1; cont < (p + n); ++cont) {
3986 if ((*cont & 0xc0) != 0x80) {
3987 err = 1;
3988 break;
3989 }
3990 }
3991 p += n;
3992 }
3993 else
3994 err = 1;
3995 max_char = new_max;
3996 }
3997 }
3998
3999 if (unicode_size)
4000 *unicode_size = char_count;
4001 if (has_errors)
4002 *has_errors = err;
4003 return max_char;
4004}
4005
/* Store `value` at position `index` of `buf`, using the element type that
   corresponds to `kind`.  Similar to PyUnicode_WRITE, but it can also write
   into the wstr field of the legacy unicode representation
   (PyUnicode_WCHAR_KIND, elements of type Py_UNICODE).  Any kind other than
   WCHAR, 1BYTE or 2BYTE is stored as Py_UCS4.  Only the selected store is
   executed, so each argument is evaluated exactly once. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                  \
    do {                                                                 \
        switch ((int)(kind)) {                                           \
        case PyUnicode_WCHAR_KIND:                                       \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);        \
            break;                                                       \
        case PyUnicode_1BYTE_KIND:                                       \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value);  \
            break;                                                       \
        case PyUnicode_2BYTE_KIND:                                       \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);              \
            break;                                                       \
        default:                                                         \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);              \
            break;                                                       \
        }                                                                \
    } while (0)
4020
Alexander Belopolsky40018472011-02-26 01:02:56 +00004021PyObject *
4022PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004023 Py_ssize_t size,
4024 const char *errors,
4025 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004026{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004027 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004028 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004029 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004030 Py_ssize_t startinpos;
4031 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00004032 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004033 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004034 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004035 PyObject *errorHandler = NULL;
4036 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004037 Py_UCS4 maxchar = 0;
4038 Py_ssize_t unicode_size;
4039 Py_ssize_t i;
4040 int kind;
4041 void *data;
4042 int has_errors;
4043 Py_UNICODE *error_outptr;
4044#if SIZEOF_WCHAR_T == 2
4045 Py_ssize_t wchar_offset = 0;
4046#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004047
Walter Dörwald69652032004-09-07 20:24:22 +00004048 if (size == 0) {
4049 if (consumed)
4050 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004051 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00004052 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004053 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
4054 consumed, &has_errors);
4055 if (has_errors) {
4056 unicode = _PyUnicode_New(size);
4057 if (!unicode)
4058 return NULL;
4059 kind = PyUnicode_WCHAR_KIND;
4060 data = PyUnicode_AS_UNICODE(unicode);
4061 assert(data != NULL);
4062 }
4063 else {
4064 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
4065 if (!unicode)
4066 return NULL;
4067 /* When the string is ASCII only, just use memcpy and return.
4068 unicode_size may be != size if there is an incomplete UTF-8
4069 sequence at the end of the ASCII block. */
4070 if (maxchar < 128 && size == unicode_size) {
4071 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4072 return (PyObject *)unicode;
4073 }
4074 kind = PyUnicode_KIND(unicode);
4075 data = PyUnicode_DATA(unicode);
4076 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004077 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004078 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004079 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00004080 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004081
4082 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004083 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004084
4085 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004086 /* Fast path for runs of ASCII characters. Given that common UTF-8
4087 input will consist of an overwhelming majority of ASCII
4088 characters, we try to optimize for this case by checking
4089 as many characters as a C 'long' can contain.
4090 First, check if we can do an aligned read, as most CPUs have
4091 a penalty for unaligned reads.
4092 */
4093 if (!((size_t) s & LONG_PTR_MASK)) {
4094 /* Help register allocation */
4095 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004096 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004097 while (_s < aligned_end) {
4098 /* Read a whole long at a time (either 4 or 8 bytes),
4099 and do a fast unrolled copy if it only contains ASCII
4100 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004101 unsigned long value = *(unsigned long *) _s;
4102 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004103 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004104 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4105 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4106 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4107 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004108#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004109 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4110 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4111 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4112 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004113#endif
4114 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004115 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004116 }
4117 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004118 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004119 if (s == e)
4120 break;
4121 ch = (unsigned char)*s;
4122 }
4123 }
4124
4125 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004126 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004127 s++;
4128 continue;
4129 }
4130
4131 n = utf8_code_length[ch];
4132
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004133 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004134 if (consumed)
4135 break;
4136 else {
4137 errmsg = "unexpected end of data";
4138 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004139 endinpos = startinpos+1;
4140 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4141 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004142 goto utf8Error;
4143 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004144 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004145
4146 switch (n) {
4147
4148 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004149 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004150 startinpos = s-starts;
4151 endinpos = startinpos+1;
4152 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004153
4154 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004155 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004156 startinpos = s-starts;
4157 endinpos = startinpos+1;
4158 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004159
4160 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004161 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004162 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004163 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004164 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004165 goto utf8Error;
4166 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004167 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004168 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004169 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004170 break;
4171
4172 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004173 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4174 will result in surrogates in range d800-dfff. Surrogates are
4175 not valid UTF-8 so they are rejected.
4176 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4177 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004178 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004179 (s[2] & 0xc0) != 0x80 ||
4180 ((unsigned char)s[0] == 0xE0 &&
4181 (unsigned char)s[1] < 0xA0) ||
4182 ((unsigned char)s[0] == 0xED &&
4183 (unsigned char)s[1] > 0x9F)) {
4184 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004185 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004186 endinpos = startinpos + 1;
4187
4188 /* if s[1] first two bits are 1 and 0, then the invalid
4189 continuation byte is s[2], so increment endinpos by 1,
4190 if not, s[1] is invalid and endinpos doesn't need to
4191 be incremented. */
4192 if ((s[1] & 0xC0) == 0x80)
4193 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004194 goto utf8Error;
4195 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004196 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004197 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004198 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004199 break;
4200
4201 case 4:
4202 if ((s[1] & 0xc0) != 0x80 ||
4203 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004204 (s[3] & 0xc0) != 0x80 ||
4205 ((unsigned char)s[0] == 0xF0 &&
4206 (unsigned char)s[1] < 0x90) ||
4207 ((unsigned char)s[0] == 0xF4 &&
4208 (unsigned char)s[1] > 0x8F)) {
4209 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004210 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004211 endinpos = startinpos + 1;
4212 if ((s[1] & 0xC0) == 0x80) {
4213 endinpos++;
4214 if ((s[2] & 0xC0) == 0x80)
4215 endinpos++;
4216 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004217 goto utf8Error;
4218 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004219 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004220 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4221 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4222
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004223 /* If the string is flexible or we have native UCS-4, write
4224 directly.. */
4225 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4226 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004227
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004228 else {
4229 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004230
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004231 /* translate from 10000..10FFFF to 0..FFFF */
4232 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004233
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004234 /* high surrogate = top 10 bits added to D800 */
4235 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4236 (Py_UNICODE)(0xD800 + (ch >> 10)));
4237
4238 /* low surrogate = bottom 10 bits added to DC00 */
4239 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4240 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4241 }
4242#if SIZEOF_WCHAR_T == 2
4243 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004244#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004245 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004246 }
4247 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004248 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004249
Benjamin Peterson29060642009-01-31 22:14:21 +00004250 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004251 /* If this is not yet a resizable string, make it one.. */
4252 if (kind != PyUnicode_WCHAR_KIND) {
4253 const Py_UNICODE *u;
4254 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4255 if (!new_unicode)
4256 goto onError;
4257 u = PyUnicode_AsUnicode((PyObject *)unicode);
4258 if (!u)
4259 goto onError;
4260#if SIZEOF_WCHAR_T == 2
4261 i += wchar_offset;
4262#endif
4263 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4264 Py_DECREF(unicode);
4265 unicode = new_unicode;
4266 kind = 0;
4267 data = PyUnicode_AS_UNICODE(new_unicode);
4268 assert(data != NULL);
4269 }
4270 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004271 if (unicode_decode_call_errorhandler(
4272 errors, &errorHandler,
4273 "utf8", errmsg,
4274 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004275 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004276 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004277 /* Update data because unicode_decode_call_errorhandler might have
4278 re-created or resized the unicode object. */
4279 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004280 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004281 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004282 /* Ensure the unicode_size calculation above was correct: */
4283 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4284
Walter Dörwald69652032004-09-07 20:24:22 +00004285 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004286 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004287
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004288 /* Adjust length and ready string when it contained errors and
4289 is of the old resizable kind. */
4290 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004291 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004292 goto onError;
4293 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004294
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004295 Py_XDECREF(errorHandler);
4296 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004297#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004298 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004299 Py_DECREF(unicode);
4300 return NULL;
4301 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004302#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004303 return (PyObject *)unicode;
4304
Benjamin Peterson29060642009-01-31 22:14:21 +00004305 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004306 Py_XDECREF(errorHandler);
4307 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004308 Py_DECREF(unicode);
4309 return NULL;
4310}
4311
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004312#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004313
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004314#ifdef __APPLE__
4315
4316/* Simplified UTF-8 decoder using surrogateescape error handler,
4317 used to decode the command line arguments on Mac OS X. */
4318
4319wchar_t*
4320_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4321{
4322 int n;
4323 const char *e;
4324 wchar_t *unicode, *p;
4325
4326 /* Note: size will always be longer than the resulting Unicode
4327 character count */
4328 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4329 PyErr_NoMemory();
4330 return NULL;
4331 }
4332 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4333 if (!unicode)
4334 return NULL;
4335
4336 /* Unpack UTF-8 encoded data */
4337 p = unicode;
4338 e = s + size;
4339 while (s < e) {
4340 Py_UCS4 ch = (unsigned char)*s;
4341
4342 if (ch < 0x80) {
4343 *p++ = (wchar_t)ch;
4344 s++;
4345 continue;
4346 }
4347
4348 n = utf8_code_length[ch];
4349 if (s + n > e) {
4350 goto surrogateescape;
4351 }
4352
4353 switch (n) {
4354 case 0:
4355 case 1:
4356 goto surrogateescape;
4357
4358 case 2:
4359 if ((s[1] & 0xc0) != 0x80)
4360 goto surrogateescape;
4361 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4362 assert ((ch > 0x007F) && (ch <= 0x07FF));
4363 *p++ = (wchar_t)ch;
4364 break;
4365
4366 case 3:
4367 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4368 will result in surrogates in range d800-dfff. Surrogates are
4369 not valid UTF-8 so they are rejected.
4370 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4371 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4372 if ((s[1] & 0xc0) != 0x80 ||
4373 (s[2] & 0xc0) != 0x80 ||
4374 ((unsigned char)s[0] == 0xE0 &&
4375 (unsigned char)s[1] < 0xA0) ||
4376 ((unsigned char)s[0] == 0xED &&
4377 (unsigned char)s[1] > 0x9F)) {
4378
4379 goto surrogateescape;
4380 }
4381 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4382 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004383 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004384 break;
4385
4386 case 4:
4387 if ((s[1] & 0xc0) != 0x80 ||
4388 (s[2] & 0xc0) != 0x80 ||
4389 (s[3] & 0xc0) != 0x80 ||
4390 ((unsigned char)s[0] == 0xF0 &&
4391 (unsigned char)s[1] < 0x90) ||
4392 ((unsigned char)s[0] == 0xF4 &&
4393 (unsigned char)s[1] > 0x8F)) {
4394 goto surrogateescape;
4395 }
4396 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4397 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4398 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4399
4400#if SIZEOF_WCHAR_T == 4
4401 *p++ = (wchar_t)ch;
4402#else
4403 /* compute and append the two surrogates: */
4404
4405 /* translate from 10000..10FFFF to 0..FFFF */
4406 ch -= 0x10000;
4407
4408 /* high surrogate = top 10 bits added to D800 */
4409 *p++ = (wchar_t)(0xD800 + (ch >> 10));
4410
4411 /* low surrogate = bottom 10 bits added to DC00 */
4412 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4413#endif
4414 break;
4415 }
4416 s += n;
4417 continue;
4418
4419 surrogateescape:
4420 *p++ = 0xDC00 + ch;
4421 s++;
4422 }
4423 *p = L'\0';
4424 return unicode;
4425}
4426
4427#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004428
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004429/* Primary internal function which creates utf8 encoded bytes objects.
4430
4431 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004432 and allocate exactly as much space needed at the end. Else allocate the
4433 maximum possible needed (4 result bytes per Unicode character), and return
4434 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004435*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004436PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004437_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004438{
Tim Peters602f7402002-04-27 18:03:26 +00004439#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004440
Guido van Rossum98297ee2007-11-06 21:34:58 +00004441 Py_ssize_t i; /* index into s of next input byte */
4442 PyObject *result; /* result string object */
4443 char *p; /* next free byte in output buffer */
4444 Py_ssize_t nallocated; /* number of result bytes allocated */
4445 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004446 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004447 PyObject *errorHandler = NULL;
4448 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004449 int kind;
4450 void *data;
4451 Py_ssize_t size;
4452 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4453#if SIZEOF_WCHAR_T == 2
4454 Py_ssize_t wchar_offset = 0;
4455#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004456
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004457 if (!PyUnicode_Check(unicode)) {
4458 PyErr_BadArgument();
4459 return NULL;
4460 }
4461
4462 if (PyUnicode_READY(unicode) == -1)
4463 return NULL;
4464
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004465 if (PyUnicode_UTF8(unicode))
4466 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4467 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004468
4469 kind = PyUnicode_KIND(unicode);
4470 data = PyUnicode_DATA(unicode);
4471 size = PyUnicode_GET_LENGTH(unicode);
4472
Tim Peters602f7402002-04-27 18:03:26 +00004473 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004474
Tim Peters602f7402002-04-27 18:03:26 +00004475 if (size <= MAX_SHORT_UNICHARS) {
4476 /* Write into the stack buffer; nallocated can't overflow.
4477 * At the end, we'll allocate exactly as much heap space as it
4478 * turns out we need.
4479 */
4480 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004481 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004482 p = stackbuf;
4483 }
4484 else {
4485 /* Overallocate on the heap, and give the excess back at the end. */
4486 nallocated = size * 4;
4487 if (nallocated / 4 != size) /* overflow! */
4488 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004489 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004490 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004491 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004492 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004493 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004494
Tim Peters602f7402002-04-27 18:03:26 +00004495 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004496 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004497
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004498 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004499 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004500 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004501
Guido van Rossumd57fd912000-03-10 22:53:23 +00004502 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004503 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004504 *p++ = (char)(0xc0 | (ch >> 6));
4505 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004506 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004507 Py_ssize_t newpos;
4508 PyObject *rep;
4509 Py_ssize_t repsize, k, startpos;
4510 startpos = i-1;
4511#if SIZEOF_WCHAR_T == 2
4512 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004513#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004514 rep = unicode_encode_call_errorhandler(
4515 errors, &errorHandler, "utf-8", "surrogates not allowed",
4516 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4517 &exc, startpos, startpos+1, &newpos);
4518 if (!rep)
4519 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004520
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004521 if (PyBytes_Check(rep))
4522 repsize = PyBytes_GET_SIZE(rep);
4523 else
4524 repsize = PyUnicode_GET_SIZE(rep);
4525
4526 if (repsize > 4) {
4527 Py_ssize_t offset;
4528
4529 if (result == NULL)
4530 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004531 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004532 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004533
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004534 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4535 /* integer overflow */
4536 PyErr_NoMemory();
4537 goto error;
4538 }
4539 nallocated += repsize - 4;
4540 if (result != NULL) {
4541 if (_PyBytes_Resize(&result, nallocated) < 0)
4542 goto error;
4543 } else {
4544 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004545 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004546 goto error;
4547 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4548 }
4549 p = PyBytes_AS_STRING(result) + offset;
4550 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004551
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004552 if (PyBytes_Check(rep)) {
4553 char *prep = PyBytes_AS_STRING(rep);
4554 for(k = repsize; k > 0; k--)
4555 *p++ = *prep++;
4556 } else /* rep is unicode */ {
4557 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4558 Py_UNICODE c;
4559
4560 for(k=0; k<repsize; k++) {
4561 c = prep[k];
4562 if (0x80 <= c) {
4563 raise_encode_exception(&exc, "utf-8",
4564 PyUnicode_AS_UNICODE(unicode),
4565 size, i-1, i,
4566 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004567 goto error;
4568 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004569 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004570 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004571 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004572 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004573 } else if (ch < 0x10000) {
4574 *p++ = (char)(0xe0 | (ch >> 12));
4575 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4576 *p++ = (char)(0x80 | (ch & 0x3f));
4577 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004578 /* Encode UCS4 Unicode ordinals */
4579 *p++ = (char)(0xf0 | (ch >> 18));
4580 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4581 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4582 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004583#if SIZEOF_WCHAR_T == 2
4584 wchar_offset++;
4585#endif
Tim Peters602f7402002-04-27 18:03:26 +00004586 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004587 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004588
Guido van Rossum98297ee2007-11-06 21:34:58 +00004589 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004590 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004591 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004592 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004593 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004594 }
4595 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004596 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004597 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004598 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004599 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004600 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004601
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004602 Py_XDECREF(errorHandler);
4603 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004604 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004605 error:
4606 Py_XDECREF(errorHandler);
4607 Py_XDECREF(exc);
4608 Py_XDECREF(result);
4609 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004610
Tim Peters602f7402002-04-27 18:03:26 +00004611#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004612}
4613
Alexander Belopolsky40018472011-02-26 01:02:56 +00004614PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004615PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4616 Py_ssize_t size,
4617 const char *errors)
4618{
4619 PyObject *v, *unicode;
4620
4621 unicode = PyUnicode_FromUnicode(s, size);
4622 if (unicode == NULL)
4623 return NULL;
4624 v = _PyUnicode_AsUTF8String(unicode, errors);
4625 Py_DECREF(unicode);
4626 return v;
4627}
4628
4629PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004630PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004631{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004632 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004633}
4634
Walter Dörwald41980ca2007-08-16 21:55:45 +00004635/* --- UTF-32 Codec ------------------------------------------------------- */
4636
4637PyObject *
4638PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004639 Py_ssize_t size,
4640 const char *errors,
4641 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004642{
4643 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4644}
4645
4646PyObject *
4647PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004648 Py_ssize_t size,
4649 const char *errors,
4650 int *byteorder,
4651 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004652{
4653 const char *starts = s;
4654 Py_ssize_t startinpos;
4655 Py_ssize_t endinpos;
4656 Py_ssize_t outpos;
4657 PyUnicodeObject *unicode;
4658 Py_UNICODE *p;
4659#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004660 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004661 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004662#else
4663 const int pairs = 0;
4664#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004665 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004666 int bo = 0; /* assume native ordering by default */
4667 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004668 /* Offsets from q for retrieving bytes in the right order. */
4669#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4670 int iorder[] = {0, 1, 2, 3};
4671#else
4672 int iorder[] = {3, 2, 1, 0};
4673#endif
4674 PyObject *errorHandler = NULL;
4675 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004676
Walter Dörwald41980ca2007-08-16 21:55:45 +00004677 q = (unsigned char *)s;
4678 e = q + size;
4679
4680 if (byteorder)
4681 bo = *byteorder;
4682
4683 /* Check for BOM marks (U+FEFF) in the input and adjust current
4684 byte order setting accordingly. In native mode, the leading BOM
4685 mark is skipped, in all other modes, it is copied to the output
4686 stream as-is (giving a ZWNBSP character). */
4687 if (bo == 0) {
4688 if (size >= 4) {
4689 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004690 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004691#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004692 if (bom == 0x0000FEFF) {
4693 q += 4;
4694 bo = -1;
4695 }
4696 else if (bom == 0xFFFE0000) {
4697 q += 4;
4698 bo = 1;
4699 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004700#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004701 if (bom == 0x0000FEFF) {
4702 q += 4;
4703 bo = 1;
4704 }
4705 else if (bom == 0xFFFE0000) {
4706 q += 4;
4707 bo = -1;
4708 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004709#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004710 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004711 }
4712
4713 if (bo == -1) {
4714 /* force LE */
4715 iorder[0] = 0;
4716 iorder[1] = 1;
4717 iorder[2] = 2;
4718 iorder[3] = 3;
4719 }
4720 else if (bo == 1) {
4721 /* force BE */
4722 iorder[0] = 3;
4723 iorder[1] = 2;
4724 iorder[2] = 1;
4725 iorder[3] = 0;
4726 }
4727
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004728 /* On narrow builds we split characters outside the BMP into two
4729 codepoints => count how much extra space we need. */
4730#ifndef Py_UNICODE_WIDE
4731 for (qq = q; qq < e; qq += 4)
4732 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4733 pairs++;
4734#endif
4735
4736 /* This might be one to much, because of a BOM */
4737 unicode = _PyUnicode_New((size+3)/4+pairs);
4738 if (!unicode)
4739 return NULL;
4740 if (size == 0)
4741 return (PyObject *)unicode;
4742
4743 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004744 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004745
Walter Dörwald41980ca2007-08-16 21:55:45 +00004746 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004747 Py_UCS4 ch;
4748 /* remaining bytes at the end? (size should be divisible by 4) */
4749 if (e-q<4) {
4750 if (consumed)
4751 break;
4752 errmsg = "truncated data";
4753 startinpos = ((const char *)q)-starts;
4754 endinpos = ((const char *)e)-starts;
4755 goto utf32Error;
4756 /* The remaining input chars are ignored if the callback
4757 chooses to skip the input */
4758 }
4759 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4760 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004761
Benjamin Peterson29060642009-01-31 22:14:21 +00004762 if (ch >= 0x110000)
4763 {
4764 errmsg = "codepoint not in range(0x110000)";
4765 startinpos = ((const char *)q)-starts;
4766 endinpos = startinpos+4;
4767 goto utf32Error;
4768 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004769#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004770 if (ch >= 0x10000)
4771 {
4772 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4773 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4774 }
4775 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004776#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004777 *p++ = ch;
4778 q += 4;
4779 continue;
4780 utf32Error:
4781 outpos = p-PyUnicode_AS_UNICODE(unicode);
4782 if (unicode_decode_call_errorhandler(
4783 errors, &errorHandler,
4784 "utf32", errmsg,
4785 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4786 &unicode, &outpos, &p))
4787 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004788 }
4789
4790 if (byteorder)
4791 *byteorder = bo;
4792
4793 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004794 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004795
4796 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02004797 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004798 goto onError;
4799
4800 Py_XDECREF(errorHandler);
4801 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004802#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004803 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004804 Py_DECREF(unicode);
4805 return NULL;
4806 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004807#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00004808 return (PyObject *)unicode;
4809
Benjamin Peterson29060642009-01-31 22:14:21 +00004810 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004811 Py_DECREF(unicode);
4812 Py_XDECREF(errorHandler);
4813 Py_XDECREF(exc);
4814 return NULL;
4815}
4816
4817PyObject *
4818PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004819 Py_ssize_t size,
4820 const char *errors,
4821 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004822{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004823 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004824 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004825 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004826#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004827 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004828#else
4829 const int pairs = 0;
4830#endif
4831 /* Offsets from p for storing byte pairs in the right order. */
4832#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4833 int iorder[] = {0, 1, 2, 3};
4834#else
4835 int iorder[] = {3, 2, 1, 0};
4836#endif
4837
Benjamin Peterson29060642009-01-31 22:14:21 +00004838#define STORECHAR(CH) \
4839 do { \
4840 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4841 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4842 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4843 p[iorder[0]] = (CH) & 0xff; \
4844 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004845 } while(0)
4846
4847 /* In narrow builds we can output surrogate pairs as one codepoint,
4848 so we need less space. */
4849#ifndef Py_UNICODE_WIDE
4850 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004851 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4852 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4853 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004854#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004855 nsize = (size - pairs + (byteorder == 0));
4856 bytesize = nsize * 4;
4857 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004858 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004859 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004860 if (v == NULL)
4861 return NULL;
4862
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004863 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004864 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004865 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004866 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004867 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004868
4869 if (byteorder == -1) {
4870 /* force LE */
4871 iorder[0] = 0;
4872 iorder[1] = 1;
4873 iorder[2] = 2;
4874 iorder[3] = 3;
4875 }
4876 else if (byteorder == 1) {
4877 /* force BE */
4878 iorder[0] = 3;
4879 iorder[1] = 2;
4880 iorder[2] = 1;
4881 iorder[3] = 0;
4882 }
4883
4884 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004885 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004886#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004887 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4888 Py_UCS4 ch2 = *s;
4889 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4890 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4891 s++;
4892 size--;
4893 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004894 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004895#endif
4896 STORECHAR(ch);
4897 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004898
4899 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004900 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004901#undef STORECHAR
4902}
4903
Alexander Belopolsky40018472011-02-26 01:02:56 +00004904PyObject *
4905PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004906{
4907 if (!PyUnicode_Check(unicode)) {
4908 PyErr_BadArgument();
4909 return NULL;
4910 }
4911 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004912 PyUnicode_GET_SIZE(unicode),
4913 NULL,
4914 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004915}
4916
Guido van Rossumd57fd912000-03-10 22:53:23 +00004917/* --- UTF-16 Codec ------------------------------------------------------- */
4918
Tim Peters772747b2001-08-09 22:21:55 +00004919PyObject *
4920PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004921 Py_ssize_t size,
4922 const char *errors,
4923 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004924{
Walter Dörwald69652032004-09-07 20:24:22 +00004925 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4926}
4927
Antoine Pitrouab868312009-01-10 15:40:25 +00004928/* Two masks for fast checking of whether a C 'long' may contain
4929 UTF16-encoded surrogate characters. This is an efficient heuristic,
4930 assuming that non-surrogate characters with a code point >= 0x8000 are
4931 rare in most input.
4932 FAST_CHAR_MASK is used when the input is in native byte ordering,
4933 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004934*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004935#if (SIZEOF_LONG == 8)
4936# define FAST_CHAR_MASK 0x8000800080008000L
4937# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4938#elif (SIZEOF_LONG == 4)
4939# define FAST_CHAR_MASK 0x80008000L
4940# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4941#else
4942# error C 'long' size should be either 4 or 8!
4943#endif
4944
Walter Dörwald69652032004-09-07 20:24:22 +00004945PyObject *
4946PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004947 Py_ssize_t size,
4948 const char *errors,
4949 int *byteorder,
4950 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004951{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004952 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004953 Py_ssize_t startinpos;
4954 Py_ssize_t endinpos;
4955 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004956 PyUnicodeObject *unicode;
4957 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004958 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004959 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004960 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004961 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004962 /* Offsets from q for retrieving byte pairs in the right order. */
4963#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4964 int ihi = 1, ilo = 0;
4965#else
4966 int ihi = 0, ilo = 1;
4967#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004968 PyObject *errorHandler = NULL;
4969 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004970
4971 /* Note: size will always be longer than the resulting Unicode
4972 character count */
4973 unicode = _PyUnicode_New(size);
4974 if (!unicode)
4975 return NULL;
4976 if (size == 0)
4977 return (PyObject *)unicode;
4978
4979 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004980 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004981 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004982 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004983
4984 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004985 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004986
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004987 /* Check for BOM marks (U+FEFF) in the input and adjust current
4988 byte order setting accordingly. In native mode, the leading BOM
4989 mark is skipped, in all other modes, it is copied to the output
4990 stream as-is (giving a ZWNBSP character). */
4991 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004992 if (size >= 2) {
4993 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004994#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004995 if (bom == 0xFEFF) {
4996 q += 2;
4997 bo = -1;
4998 }
4999 else if (bom == 0xFFFE) {
5000 q += 2;
5001 bo = 1;
5002 }
Tim Petersced69f82003-09-16 20:30:58 +00005003#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005004 if (bom == 0xFEFF) {
5005 q += 2;
5006 bo = 1;
5007 }
5008 else if (bom == 0xFFFE) {
5009 q += 2;
5010 bo = -1;
5011 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005012#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005013 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005014 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005015
Tim Peters772747b2001-08-09 22:21:55 +00005016 if (bo == -1) {
5017 /* force LE */
5018 ihi = 1;
5019 ilo = 0;
5020 }
5021 else if (bo == 1) {
5022 /* force BE */
5023 ihi = 0;
5024 ilo = 1;
5025 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005026#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5027 native_ordering = ilo < ihi;
5028#else
5029 native_ordering = ilo > ihi;
5030#endif
Tim Peters772747b2001-08-09 22:21:55 +00005031
Antoine Pitrouab868312009-01-10 15:40:25 +00005032 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005033 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005034 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005035 /* First check for possible aligned read of a C 'long'. Unaligned
5036 reads are more expensive, better to defer to another iteration. */
5037 if (!((size_t) q & LONG_PTR_MASK)) {
5038 /* Fast path for runs of non-surrogate chars. */
5039 register const unsigned char *_q = q;
5040 Py_UNICODE *_p = p;
5041 if (native_ordering) {
5042 /* Native ordering is simple: as long as the input cannot
5043 possibly contain a surrogate char, do an unrolled copy
5044 of several 16-bit code points to the target object.
5045 The non-surrogate check is done on several input bytes
5046 at a time (as many as a C 'long' can contain). */
5047 while (_q < aligned_end) {
5048 unsigned long data = * (unsigned long *) _q;
5049 if (data & FAST_CHAR_MASK)
5050 break;
5051 _p[0] = ((unsigned short *) _q)[0];
5052 _p[1] = ((unsigned short *) _q)[1];
5053#if (SIZEOF_LONG == 8)
5054 _p[2] = ((unsigned short *) _q)[2];
5055 _p[3] = ((unsigned short *) _q)[3];
5056#endif
5057 _q += SIZEOF_LONG;
5058 _p += SIZEOF_LONG / 2;
5059 }
5060 }
5061 else {
5062 /* Byteswapped ordering is similar, but we must decompose
5063 the copy bytewise, and take care of zero'ing out the
5064 upper bytes if the target object is in 32-bit units
5065 (that is, in UCS-4 builds). */
5066 while (_q < aligned_end) {
5067 unsigned long data = * (unsigned long *) _q;
5068 if (data & SWAPPED_FAST_CHAR_MASK)
5069 break;
5070 /* Zero upper bytes in UCS-4 builds */
5071#if (Py_UNICODE_SIZE > 2)
5072 _p[0] = 0;
5073 _p[1] = 0;
5074#if (SIZEOF_LONG == 8)
5075 _p[2] = 0;
5076 _p[3] = 0;
5077#endif
5078#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005079 /* Issue #4916; UCS-4 builds on big endian machines must
5080 fill the two last bytes of each 4-byte unit. */
5081#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
5082# define OFF 2
5083#else
5084# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00005085#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005086 ((unsigned char *) _p)[OFF + 1] = _q[0];
5087 ((unsigned char *) _p)[OFF + 0] = _q[1];
5088 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
5089 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
5090#if (SIZEOF_LONG == 8)
5091 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
5092 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
5093 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
5094 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
5095#endif
5096#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00005097 _q += SIZEOF_LONG;
5098 _p += SIZEOF_LONG / 2;
5099 }
5100 }
5101 p = _p;
5102 q = _q;
5103 if (q >= e)
5104 break;
5105 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005106 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005107
Benjamin Peterson14339b62009-01-31 16:36:08 +00005108 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005109
5110 if (ch < 0xD800 || ch > 0xDFFF) {
5111 *p++ = ch;
5112 continue;
5113 }
5114
5115 /* UTF-16 code pair: */
5116 if (q > e) {
5117 errmsg = "unexpected end of data";
5118 startinpos = (((const char *)q) - 2) - starts;
5119 endinpos = ((const char *)e) + 1 - starts;
5120 goto utf16Error;
5121 }
5122 if (0xD800 <= ch && ch <= 0xDBFF) {
5123 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5124 q += 2;
5125 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00005126#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005127 *p++ = ch;
5128 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005129#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005130 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005131#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005132 continue;
5133 }
5134 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005135 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005136 startinpos = (((const char *)q)-4)-starts;
5137 endinpos = startinpos+2;
5138 goto utf16Error;
5139 }
5140
Benjamin Peterson14339b62009-01-31 16:36:08 +00005141 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005142 errmsg = "illegal encoding";
5143 startinpos = (((const char *)q)-2)-starts;
5144 endinpos = startinpos+2;
5145 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005146
Benjamin Peterson29060642009-01-31 22:14:21 +00005147 utf16Error:
5148 outpos = p - PyUnicode_AS_UNICODE(unicode);
5149 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005150 errors,
5151 &errorHandler,
5152 "utf16", errmsg,
5153 &starts,
5154 (const char **)&e,
5155 &startinpos,
5156 &endinpos,
5157 &exc,
5158 (const char **)&q,
5159 &unicode,
5160 &outpos,
5161 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00005162 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005163 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005164 /* remaining byte at the end? (size should be even) */
5165 if (e == q) {
5166 if (!consumed) {
5167 errmsg = "truncated data";
5168 startinpos = ((const char *)q) - starts;
5169 endinpos = ((const char *)e) + 1 - starts;
5170 outpos = p - PyUnicode_AS_UNICODE(unicode);
5171 if (unicode_decode_call_errorhandler(
5172 errors,
5173 &errorHandler,
5174 "utf16", errmsg,
5175 &starts,
5176 (const char **)&e,
5177 &startinpos,
5178 &endinpos,
5179 &exc,
5180 (const char **)&q,
5181 &unicode,
5182 &outpos,
5183 &p))
5184 goto onError;
5185 /* The remaining input chars are ignored if the callback
5186 chooses to skip the input */
5187 }
5188 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005189
5190 if (byteorder)
5191 *byteorder = bo;
5192
Walter Dörwald69652032004-09-07 20:24:22 +00005193 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005194 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005195
Guido van Rossumd57fd912000-03-10 22:53:23 +00005196 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005197 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005198 goto onError;
5199
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005200 Py_XDECREF(errorHandler);
5201 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005202#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005203 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005204 Py_DECREF(unicode);
5205 return NULL;
5206 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005207#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00005208 return (PyObject *)unicode;
5209
Benjamin Peterson29060642009-01-31 22:14:21 +00005210 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005211 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005212 Py_XDECREF(errorHandler);
5213 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005214 return NULL;
5215}
5216
Antoine Pitrouab868312009-01-10 15:40:25 +00005217#undef FAST_CHAR_MASK
5218#undef SWAPPED_FAST_CHAR_MASK
5219
Tim Peters772747b2001-08-09 22:21:55 +00005220PyObject *
5221PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005222 Py_ssize_t size,
5223 const char *errors,
5224 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005225{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005226 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005227 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005228 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005229#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005230 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005231#else
5232 const int pairs = 0;
5233#endif
Tim Peters772747b2001-08-09 22:21:55 +00005234 /* Offsets from p for storing byte pairs in the right order. */
5235#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5236 int ihi = 1, ilo = 0;
5237#else
5238 int ihi = 0, ilo = 1;
5239#endif
5240
Benjamin Peterson29060642009-01-31 22:14:21 +00005241#define STORECHAR(CH) \
5242 do { \
5243 p[ihi] = ((CH) >> 8) & 0xff; \
5244 p[ilo] = (CH) & 0xff; \
5245 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005246 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005247
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005248#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005249 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005250 if (s[i] >= 0x10000)
5251 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005252#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005253 /* 2 * (size + pairs + (byteorder == 0)) */
5254 if (size > PY_SSIZE_T_MAX ||
5255 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005256 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005257 nsize = size + pairs + (byteorder == 0);
5258 bytesize = nsize * 2;
5259 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005260 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005261 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005262 if (v == NULL)
5263 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005264
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005265 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005266 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005267 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005268 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005269 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005270
5271 if (byteorder == -1) {
5272 /* force LE */
5273 ihi = 1;
5274 ilo = 0;
5275 }
5276 else if (byteorder == 1) {
5277 /* force BE */
5278 ihi = 0;
5279 ilo = 1;
5280 }
5281
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005282 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005283 Py_UNICODE ch = *s++;
5284 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005285#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005286 if (ch >= 0x10000) {
5287 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5288 ch = 0xD800 | ((ch-0x10000) >> 10);
5289 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005290#endif
Tim Peters772747b2001-08-09 22:21:55 +00005291 STORECHAR(ch);
5292 if (ch2)
5293 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005294 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005295
5296 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005297 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005298#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005299}
5300
Alexander Belopolsky40018472011-02-26 01:02:56 +00005301PyObject *
5302PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005303{
5304 if (!PyUnicode_Check(unicode)) {
5305 PyErr_BadArgument();
5306 return NULL;
5307 }
5308 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005309 PyUnicode_GET_SIZE(unicode),
5310 NULL,
5311 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005312}
5313
5314/* --- Unicode Escape Codec ----------------------------------------------- */
5315
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   the escaped input decodes to a pure ASCII string.

   Returns the number of characters the decoded string will contain, or
   -1 when that cannot be guaranteed: negative size, a non-ASCII byte,
   a backslash at the very end of the input, or a numeric/named escape
   (octal digit, \x, \u, \U, \N) whose value may lie outside ASCII.
 */
Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Validate size before it is used in pointer arithmetic. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                /* simple escapes decode to exactly one character */
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* unknown escape: count the backslash + the other
                   character, both are kept verbatim */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5370
5371/* Similar to PyUnicode_WRITE but either write into wstr field
5372 or treat string as ASCII. */
5373#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
5374 do { \
5375 if ((kind) != PyUnicode_WCHAR_KIND) \
5376 ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
5377 else \
5378 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
5379 } while (0)
5380
5381#define WRITE_WSTR(buf, index, value) \
5382 assert(kind == PyUnicode_WCHAR_KIND), \
5383 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value)
5384
5385
Fredrik Lundh06d12682001-01-24 07:59:11 +00005386static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005387
Alexander Belopolsky40018472011-02-26 01:02:56 +00005388PyObject *
5389PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005390 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005391 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005392{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005393 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005394 Py_ssize_t startinpos;
5395 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005396 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005397 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005398 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005399 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005400 char* message;
5401 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005402 PyObject *errorHandler = NULL;
5403 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005404 Py_ssize_t ascii_length;
5405 Py_ssize_t i;
5406 int kind;
5407 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005408
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005409 ascii_length = length_of_escaped_ascii_string(s, size);
5410
5411 /* After length_of_escaped_ascii_string() there are two alternatives,
5412 either the string is pure ASCII with named escapes like \n, etc.
5413 and we determined it's exact size (common case)
5414 or it contains \x, \u, ... escape sequences. then we create a
5415 legacy wchar string and resize it at the end of this function. */
5416 if (ascii_length >= 0) {
5417 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5418 if (!v)
5419 goto onError;
5420 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5421 kind = PyUnicode_1BYTE_KIND;
5422 data = PyUnicode_DATA(v);
5423 }
5424 else {
5425 /* Escaped strings will always be longer than the resulting
5426 Unicode string, so we start with size here and then reduce the
5427 length after conversion to the true value.
5428 (but if the error callback returns a long replacement string
5429 we'll have to allocate more space) */
5430 v = _PyUnicode_New(size);
5431 if (!v)
5432 goto onError;
5433 kind = PyUnicode_WCHAR_KIND;
5434 data = PyUnicode_AS_UNICODE(v);
5435 }
5436
Guido van Rossumd57fd912000-03-10 22:53:23 +00005437 if (size == 0)
5438 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005439 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005440 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005441
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442 while (s < end) {
5443 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005444 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005445 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005446
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005447 if (kind == PyUnicode_WCHAR_KIND) {
5448 assert(i < _PyUnicode_WSTR_LENGTH(v));
5449 }
5450 else {
5451 /* The only case in which i == ascii_length is a backslash
5452 followed by a newline. */
5453 assert(i <= ascii_length);
5454 }
5455
Guido van Rossumd57fd912000-03-10 22:53:23 +00005456 /* Non-escape characters are interpreted as Unicode ordinals */
5457 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005458 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459 continue;
5460 }
5461
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005462 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005463 /* \ - Escapes */
5464 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005465 c = *s++;
5466 if (s > end)
5467 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005468
5469 if (kind == PyUnicode_WCHAR_KIND) {
5470 assert(i < _PyUnicode_WSTR_LENGTH(v));
5471 }
5472 else {
5473 /* The only case in which i == ascii_length is a backslash
5474 followed by a newline. */
5475 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5476 }
5477
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005478 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005479
Benjamin Peterson29060642009-01-31 22:14:21 +00005480 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005481 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005482 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5483 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5484 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5485 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5486 /* FF */
5487 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5488 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5489 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5490 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5491 /* VT */
5492 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5493 /* BEL, not classic C */
5494 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005495
Benjamin Peterson29060642009-01-31 22:14:21 +00005496 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005497 case '0': case '1': case '2': case '3':
5498 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005499 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005500 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005501 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005502 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005503 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005504 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005505 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005506 break;
5507
Benjamin Peterson29060642009-01-31 22:14:21 +00005508 /* hex escapes */
5509 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005510 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005511 digits = 2;
5512 message = "truncated \\xXX escape";
5513 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005514
Benjamin Peterson29060642009-01-31 22:14:21 +00005515 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005516 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005517 digits = 4;
5518 message = "truncated \\uXXXX escape";
5519 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005520
Benjamin Peterson29060642009-01-31 22:14:21 +00005521 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005522 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005523 digits = 8;
5524 message = "truncated \\UXXXXXXXX escape";
5525 hexescape:
5526 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005527 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005528 if (s+digits>end) {
5529 endinpos = size;
5530 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005531 errors, &errorHandler,
5532 "unicodeescape", "end of string in escape sequence",
5533 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005534 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005535 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005536 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005537 goto nextByte;
5538 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005539 for (j = 0; j < digits; ++j) {
5540 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005541 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005542 endinpos = (s+j+1)-starts;
5543 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005544 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005545 errors, &errorHandler,
5546 "unicodeescape", message,
5547 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005548 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005549 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005550 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005551 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005552 }
5553 chr = (chr<<4) & ~0xF;
5554 if (c >= '0' && c <= '9')
5555 chr += c - '0';
5556 else if (c >= 'a' && c <= 'f')
5557 chr += 10 + c - 'a';
5558 else
5559 chr += 10 + c - 'A';
5560 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005561 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005562 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005563 /* _decoding_error will have already written into the
5564 target buffer. */
5565 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005566 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005567 /* when we get here, chr is a 32-bit unicode character */
5568 if (chr <= 0xffff)
5569 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005570 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005571 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005572 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005573 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005574#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005575 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005576#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005577 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005578 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5579 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005580#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005581 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005582 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005583 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005584 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005585 errors, &errorHandler,
5586 "unicodeescape", "illegal Unicode character",
5587 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005588 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005589 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005590 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005591 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005592 break;
5593
Benjamin Peterson29060642009-01-31 22:14:21 +00005594 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005595 case 'N':
5596 message = "malformed \\N character escape";
5597 if (ucnhash_CAPI == NULL) {
5598 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005599 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5600 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005601 if (ucnhash_CAPI == NULL)
5602 goto ucnhashError;
5603 }
5604 if (*s == '{') {
5605 const char *start = s+1;
5606 /* look for the closing brace */
5607 while (*s != '}' && s < end)
5608 s++;
5609 if (s > start && s < end && *s == '}') {
5610 /* found a name. look it up in the unicode database */
5611 message = "unknown Unicode character name";
5612 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005613 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5614 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005615 goto store;
5616 }
5617 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005618 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005619 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005620 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005621 errors, &errorHandler,
5622 "unicodeescape", message,
5623 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005624 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005625 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005626 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005627 break;
5628
5629 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005630 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005631 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005632 message = "\\ at end of string";
5633 s--;
5634 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005635 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005636 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005637 errors, &errorHandler,
5638 "unicodeescape", message,
5639 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005640 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005641 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005642 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005643 }
5644 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005645 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5646 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005647 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005648 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005649 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005650 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005651 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005652 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005653 /* Ensure the length prediction worked in case of ASCII strings */
5654 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5655
Victor Stinnerfe226c02011-10-03 03:52:20 +02005656 if (kind == PyUnicode_WCHAR_KIND)
5657 {
5658 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5659 goto onError;
Victor Stinnerfe226c02011-10-03 03:52:20 +02005660 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005661 Py_XDECREF(errorHandler);
5662 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005663#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005664 if (_PyUnicode_READY_REPLACE(&v)) {
5665 Py_DECREF(v);
5666 return NULL;
5667 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005668#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00005669 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005670
Benjamin Peterson29060642009-01-31 22:14:21 +00005671 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005672 PyErr_SetString(
5673 PyExc_UnicodeError,
5674 "\\N escapes not supported (can't load unicodedata module)"
5675 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005676 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005677 Py_XDECREF(errorHandler);
5678 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005679 return NULL;
5680
Benjamin Peterson29060642009-01-31 22:14:21 +00005681 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005682 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005683 Py_XDECREF(errorHandler);
5684 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005685 return NULL;
5686}
5687
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005688#undef WRITE_ASCII_OR_WSTR
5689#undef WRITE_WSTR
5690
Guido van Rossumd57fd912000-03-10 22:53:23 +00005691/* Return a Unicode-Escape string version of the Unicode object.
5692
5693 If quotes is true, the string is enclosed in u"" or u'' quotes as
5694 appropriate.
5695
5696*/
5697
Walter Dörwald79e913e2007-05-12 11:08:06 +00005698static const char *hexdigits = "0123456789abcdef";
5699
Alexander Belopolsky40018472011-02-26 01:02:56 +00005700PyObject *
5701PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005702 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005704 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005705 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005706
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005707#ifdef Py_UNICODE_WIDE
5708 const Py_ssize_t expandsize = 10;
5709#else
5710 const Py_ssize_t expandsize = 6;
5711#endif
5712
Thomas Wouters89f507f2006-12-13 04:49:30 +00005713 /* XXX(nnorwitz): rather than over-allocating, it would be
5714 better to choose a different scheme. Perhaps scan the
5715 first N-chars of the string and allocate based on that size.
5716 */
5717 /* Initial allocation is based on the longest-possible unichr
5718 escape.
5719
5720 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5721 unichr, so in this case it's the longest unichr escape. In
5722 narrow (UTF-16) builds this is five chars per source unichr
5723 since there are two unichrs in the surrogate pair, so in narrow
5724 (UTF-16) builds it's not the longest unichr escape.
5725
5726 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5727 so in the narrow (UTF-16) build case it's the longest unichr
5728 escape.
5729 */
5730
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005731 if (size == 0)
5732 return PyBytes_FromStringAndSize(NULL, 0);
5733
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005734 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005735 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005736
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005737 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005738 2
5739 + expandsize*size
5740 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741 if (repr == NULL)
5742 return NULL;
5743
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005744 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745
Guido van Rossumd57fd912000-03-10 22:53:23 +00005746 while (size-- > 0) {
5747 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005748
Walter Dörwald79e913e2007-05-12 11:08:06 +00005749 /* Escape backslashes */
5750 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751 *p++ = '\\';
5752 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005753 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005754 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005755
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005756#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005757 /* Map 21-bit characters to '\U00xxxxxx' */
5758 else if (ch >= 0x10000) {
5759 *p++ = '\\';
5760 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005761 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5762 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5763 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5764 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5765 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5766 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5767 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5768 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005769 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005770 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005771#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005772 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5773 else if (ch >= 0xD800 && ch < 0xDC00) {
5774 Py_UNICODE ch2;
5775 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005776
Benjamin Peterson29060642009-01-31 22:14:21 +00005777 ch2 = *s++;
5778 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005779 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005780 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5781 *p++ = '\\';
5782 *p++ = 'U';
5783 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5784 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5785 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5786 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5787 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5788 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5789 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5790 *p++ = hexdigits[ucs & 0x0000000F];
5791 continue;
5792 }
5793 /* Fall through: isolated surrogates are copied as-is */
5794 s--;
5795 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005796 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005797#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005798
Guido van Rossumd57fd912000-03-10 22:53:23 +00005799 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005800 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005801 *p++ = '\\';
5802 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005803 *p++ = hexdigits[(ch >> 12) & 0x000F];
5804 *p++ = hexdigits[(ch >> 8) & 0x000F];
5805 *p++ = hexdigits[(ch >> 4) & 0x000F];
5806 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005807 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005808
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005809 /* Map special whitespace to '\t', \n', '\r' */
5810 else if (ch == '\t') {
5811 *p++ = '\\';
5812 *p++ = 't';
5813 }
5814 else if (ch == '\n') {
5815 *p++ = '\\';
5816 *p++ = 'n';
5817 }
5818 else if (ch == '\r') {
5819 *p++ = '\\';
5820 *p++ = 'r';
5821 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005822
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005823 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005824 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005825 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005826 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005827 *p++ = hexdigits[(ch >> 4) & 0x000F];
5828 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005829 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005830
Guido van Rossumd57fd912000-03-10 22:53:23 +00005831 /* Copy everything else as-is */
5832 else
5833 *p++ = (char) ch;
5834 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005835
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005836 assert(p - PyBytes_AS_STRING(repr) > 0);
5837 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5838 return NULL;
5839 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005840}
5841
Alexander Belopolsky40018472011-02-26 01:02:56 +00005842PyObject *
5843PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005845 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005846 if (!PyUnicode_Check(unicode)) {
5847 PyErr_BadArgument();
5848 return NULL;
5849 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005850 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5851 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005852 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005853}
5854
5855/* --- Raw Unicode Escape Codec ------------------------------------------- */
5856
Alexander Belopolsky40018472011-02-26 01:02:56 +00005857PyObject *
5858PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005859 Py_ssize_t size,
5860 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005861{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005862 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005863 Py_ssize_t startinpos;
5864 Py_ssize_t endinpos;
5865 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005866 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005867 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868 const char *end;
5869 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005870 PyObject *errorHandler = NULL;
5871 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005872
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873 /* Escaped strings will always be longer than the resulting
5874 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005875 length after conversion to the true value. (But decoding error
5876 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005877 v = _PyUnicode_New(size);
5878 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005879 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005880 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005881 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005882 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005883 end = s + size;
5884 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005885 unsigned char c;
5886 Py_UCS4 x;
5887 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005888 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005889
Benjamin Peterson29060642009-01-31 22:14:21 +00005890 /* Non-escape characters are interpreted as Unicode ordinals */
5891 if (*s != '\\') {
5892 *p++ = (unsigned char)*s++;
5893 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005894 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005895 startinpos = s-starts;
5896
5897 /* \u-escapes are only interpreted iff the number of leading
5898 backslashes if odd */
5899 bs = s;
5900 for (;s < end;) {
5901 if (*s != '\\')
5902 break;
5903 *p++ = (unsigned char)*s++;
5904 }
5905 if (((s - bs) & 1) == 0 ||
5906 s >= end ||
5907 (*s != 'u' && *s != 'U')) {
5908 continue;
5909 }
5910 p--;
5911 count = *s=='u' ? 4 : 8;
5912 s++;
5913
5914 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5915 outpos = p-PyUnicode_AS_UNICODE(v);
5916 for (x = 0, i = 0; i < count; ++i, ++s) {
5917 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005918 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005919 endinpos = s-starts;
5920 if (unicode_decode_call_errorhandler(
5921 errors, &errorHandler,
5922 "rawunicodeescape", "truncated \\uXXXX",
5923 &starts, &end, &startinpos, &endinpos, &exc, &s,
5924 &v, &outpos, &p))
5925 goto onError;
5926 goto nextByte;
5927 }
5928 x = (x<<4) & ~0xF;
5929 if (c >= '0' && c <= '9')
5930 x += c - '0';
5931 else if (c >= 'a' && c <= 'f')
5932 x += 10 + c - 'a';
5933 else
5934 x += 10 + c - 'A';
5935 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005936 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005937 /* UCS-2 character */
5938 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005939 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005940 /* UCS-4 character. Either store directly, or as
5941 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005942#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005943 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005944#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005945 x -= 0x10000L;
5946 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5947 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005948#endif
5949 } else {
5950 endinpos = s-starts;
5951 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005952 if (unicode_decode_call_errorhandler(
5953 errors, &errorHandler,
5954 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005955 &starts, &end, &startinpos, &endinpos, &exc, &s,
5956 &v, &outpos, &p))
5957 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005958 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005959 nextByte:
5960 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005961 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02005962 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005963 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005964 Py_XDECREF(errorHandler);
5965 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005966#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005967 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005968 Py_DECREF(v);
5969 return NULL;
5970 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005971#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00005972 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005973
Benjamin Peterson29060642009-01-31 22:14:21 +00005974 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005975 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005976 Py_XDECREF(errorHandler);
5977 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005978 return NULL;
5979}
5980
Alexander Belopolsky40018472011-02-26 01:02:56 +00005981PyObject *
5982PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005983 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005985 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005986 char *p;
5987 char *q;
5988
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005989#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005990 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005991#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005992 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005993#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005994
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005995 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005996 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005997
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005998 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999 if (repr == NULL)
6000 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00006001 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006002 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006004 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005 while (size-- > 0) {
6006 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006007#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006008 /* Map 32-bit characters to '\Uxxxxxxxx' */
6009 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006010 *p++ = '\\';
6011 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00006012 *p++ = hexdigits[(ch >> 28) & 0xf];
6013 *p++ = hexdigits[(ch >> 24) & 0xf];
6014 *p++ = hexdigits[(ch >> 20) & 0xf];
6015 *p++ = hexdigits[(ch >> 16) & 0xf];
6016 *p++ = hexdigits[(ch >> 12) & 0xf];
6017 *p++ = hexdigits[(ch >> 8) & 0xf];
6018 *p++ = hexdigits[(ch >> 4) & 0xf];
6019 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006020 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006021 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00006022#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006023 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6024 if (ch >= 0xD800 && ch < 0xDC00) {
6025 Py_UNICODE ch2;
6026 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006027
Benjamin Peterson29060642009-01-31 22:14:21 +00006028 ch2 = *s++;
6029 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00006030 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006031 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6032 *p++ = '\\';
6033 *p++ = 'U';
6034 *p++ = hexdigits[(ucs >> 28) & 0xf];
6035 *p++ = hexdigits[(ucs >> 24) & 0xf];
6036 *p++ = hexdigits[(ucs >> 20) & 0xf];
6037 *p++ = hexdigits[(ucs >> 16) & 0xf];
6038 *p++ = hexdigits[(ucs >> 12) & 0xf];
6039 *p++ = hexdigits[(ucs >> 8) & 0xf];
6040 *p++ = hexdigits[(ucs >> 4) & 0xf];
6041 *p++ = hexdigits[ucs & 0xf];
6042 continue;
6043 }
6044 /* Fall through: isolated surrogates are copied as-is */
6045 s--;
6046 size++;
6047 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006048#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006049 /* Map 16-bit characters to '\uxxxx' */
6050 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006051 *p++ = '\\';
6052 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00006053 *p++ = hexdigits[(ch >> 12) & 0xf];
6054 *p++ = hexdigits[(ch >> 8) & 0xf];
6055 *p++ = hexdigits[(ch >> 4) & 0xf];
6056 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006058 /* Copy everything else as-is */
6059 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060 *p++ = (char) ch;
6061 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006062 size = p - q;
6063
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006064 assert(size > 0);
6065 if (_PyBytes_Resize(&repr, size) < 0)
6066 return NULL;
6067 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068}
6069
Alexander Belopolsky40018472011-02-26 01:02:56 +00006070PyObject *
6071PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006073 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006074 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00006075 PyErr_BadArgument();
6076 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006077 }
Walter Dörwald711005d2007-05-12 12:03:26 +00006078 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6079 PyUnicode_GET_SIZE(unicode));
6080
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006081 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006082}
6083
/* --- Unicode Internal Codec ------------------------------------------- */
6085
Alexander Belopolsky40018472011-02-26 01:02:56 +00006086PyObject *
6087_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006088 Py_ssize_t size,
6089 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006090{
6091 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006092 Py_ssize_t startinpos;
6093 Py_ssize_t endinpos;
6094 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006095 PyUnicodeObject *v;
6096 Py_UNICODE *p;
6097 const char *end;
6098 const char *reason;
6099 PyObject *errorHandler = NULL;
6100 PyObject *exc = NULL;
6101
Neal Norwitzd43069c2006-01-08 01:12:10 +00006102#ifdef Py_UNICODE_WIDE
6103 Py_UNICODE unimax = PyUnicode_GetMax();
6104#endif
6105
Thomas Wouters89f507f2006-12-13 04:49:30 +00006106 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006107 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
6108 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006109 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006110 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
6111 as string was created with the old API. */
6112 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006113 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006114 p = PyUnicode_AS_UNICODE(v);
6115 end = s + size;
6116
6117 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006118 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006119 /* We have to sanity check the raw data, otherwise doom looms for
6120 some malformed UCS-4 data. */
6121 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006122#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006123 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006124#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006125 end-s < Py_UNICODE_SIZE
6126 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006127 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006128 startinpos = s - starts;
6129 if (end-s < Py_UNICODE_SIZE) {
6130 endinpos = end-starts;
6131 reason = "truncated input";
6132 }
6133 else {
6134 endinpos = s - starts + Py_UNICODE_SIZE;
6135 reason = "illegal code point (> 0x10FFFF)";
6136 }
6137 outpos = p - PyUnicode_AS_UNICODE(v);
6138 if (unicode_decode_call_errorhandler(
6139 errors, &errorHandler,
6140 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006141 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00006142 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006143 goto onError;
6144 }
6145 }
6146 else {
6147 p++;
6148 s += Py_UNICODE_SIZE;
6149 }
6150 }
6151
Victor Stinnerfe226c02011-10-03 03:52:20 +02006152 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006153 goto onError;
6154 Py_XDECREF(errorHandler);
6155 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006156#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006157 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006158 Py_DECREF(v);
6159 return NULL;
6160 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006161#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006162 return (PyObject *)v;
6163
Benjamin Peterson29060642009-01-31 22:14:21 +00006164 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006165 Py_XDECREF(v);
6166 Py_XDECREF(errorHandler);
6167 Py_XDECREF(exc);
6168 return NULL;
6169}
6170
/* --- Latin-1 Codec ------------------------------------------------------ */
6172
Alexander Belopolsky40018472011-02-26 01:02:56 +00006173PyObject *
6174PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006175 Py_ssize_t size,
6176 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006177{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006178 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006179 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180}
6181
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006182/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006183static void
6184make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006185 const char *encoding,
6186 const Py_UNICODE *unicode, Py_ssize_t size,
6187 Py_ssize_t startpos, Py_ssize_t endpos,
6188 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006189{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006190 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006191 *exceptionObject = PyUnicodeEncodeError_Create(
6192 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006193 }
6194 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006195 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6196 goto onError;
6197 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6198 goto onError;
6199 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6200 goto onError;
6201 return;
6202 onError:
6203 Py_DECREF(*exceptionObject);
6204 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205 }
6206}
6207
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006208/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006209static void
6210raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006211 const char *encoding,
6212 const Py_UNICODE *unicode, Py_ssize_t size,
6213 Py_ssize_t startpos, Py_ssize_t endpos,
6214 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006215{
6216 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006217 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006218 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006219 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006220}
6221
6222/* error handling callback helper:
6223 build arguments, call the callback and check the arguments,
6224 put the result into newpos and return the replacement string, which
6225 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006226static PyObject *
6227unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006228 PyObject **errorHandler,
6229 const char *encoding, const char *reason,
6230 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6231 Py_ssize_t startpos, Py_ssize_t endpos,
6232 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006233{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006234 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006235
6236 PyObject *restuple;
6237 PyObject *resunicode;
6238
6239 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006240 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006241 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006242 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006243 }
6244
6245 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006246 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006247 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006248 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006249
6250 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006251 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006252 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006253 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006254 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006255 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006256 Py_DECREF(restuple);
6257 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006258 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006259 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006260 &resunicode, newpos)) {
6261 Py_DECREF(restuple);
6262 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006263 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006264 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6265 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6266 Py_DECREF(restuple);
6267 return NULL;
6268 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006269 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006270 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006271 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006272 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6273 Py_DECREF(restuple);
6274 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006275 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006276 Py_INCREF(resunicode);
6277 Py_DECREF(restuple);
6278 return resunicode;
6279}
6280
Alexander Belopolsky40018472011-02-26 01:02:56 +00006281static PyObject *
6282unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006283 Py_ssize_t size,
6284 const char *errors,
6285 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006286{
6287 /* output object */
6288 PyObject *res;
6289 /* pointers to the beginning and end+1 of input */
6290 const Py_UNICODE *startp = p;
6291 const Py_UNICODE *endp = p + size;
6292 /* pointer to the beginning of the unencodable characters */
6293 /* const Py_UNICODE *badp = NULL; */
6294 /* pointer into the output */
6295 char *str;
6296 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006297 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006298 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6299 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006300 PyObject *errorHandler = NULL;
6301 PyObject *exc = NULL;
6302 /* the following variable is used for caching string comparisons
6303 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6304 int known_errorHandler = -1;
6305
6306 /* allocate enough for a simple encoding without
6307 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006308 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006309 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006310 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006311 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006312 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006313 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006314 ressize = size;
6315
6316 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006317 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006318
Benjamin Peterson29060642009-01-31 22:14:21 +00006319 /* can we encode this? */
6320 if (c<limit) {
6321 /* no overflow check, because we know that the space is enough */
6322 *str++ = (char)c;
6323 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006324 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006325 else {
6326 Py_ssize_t unicodepos = p-startp;
6327 Py_ssize_t requiredsize;
6328 PyObject *repunicode;
6329 Py_ssize_t repsize;
6330 Py_ssize_t newpos;
6331 Py_ssize_t respos;
6332 Py_UNICODE *uni2;
6333 /* startpos for collecting unencodable chars */
6334 const Py_UNICODE *collstart = p;
6335 const Py_UNICODE *collend = p;
6336 /* find all unecodable characters */
6337 while ((collend < endp) && ((*collend)>=limit))
6338 ++collend;
6339 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6340 if (known_errorHandler==-1) {
6341 if ((errors==NULL) || (!strcmp(errors, "strict")))
6342 known_errorHandler = 1;
6343 else if (!strcmp(errors, "replace"))
6344 known_errorHandler = 2;
6345 else if (!strcmp(errors, "ignore"))
6346 known_errorHandler = 3;
6347 else if (!strcmp(errors, "xmlcharrefreplace"))
6348 known_errorHandler = 4;
6349 else
6350 known_errorHandler = 0;
6351 }
6352 switch (known_errorHandler) {
6353 case 1: /* strict */
6354 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6355 goto onError;
6356 case 2: /* replace */
6357 while (collstart++<collend)
6358 *str++ = '?'; /* fall through */
6359 case 3: /* ignore */
6360 p = collend;
6361 break;
6362 case 4: /* xmlcharrefreplace */
6363 respos = str - PyBytes_AS_STRING(res);
6364 /* determine replacement size (temporarily (mis)uses p) */
6365 for (p = collstart, repsize = 0; p < collend; ++p) {
6366 if (*p<10)
6367 repsize += 2+1+1;
6368 else if (*p<100)
6369 repsize += 2+2+1;
6370 else if (*p<1000)
6371 repsize += 2+3+1;
6372 else if (*p<10000)
6373 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006374#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006375 else
6376 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006377#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006378 else if (*p<100000)
6379 repsize += 2+5+1;
6380 else if (*p<1000000)
6381 repsize += 2+6+1;
6382 else
6383 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006384#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006385 }
6386 requiredsize = respos+repsize+(endp-collend);
6387 if (requiredsize > ressize) {
6388 if (requiredsize<2*ressize)
6389 requiredsize = 2*ressize;
6390 if (_PyBytes_Resize(&res, requiredsize))
6391 goto onError;
6392 str = PyBytes_AS_STRING(res) + respos;
6393 ressize = requiredsize;
6394 }
6395 /* generate replacement (temporarily (mis)uses p) */
6396 for (p = collstart; p < collend; ++p) {
6397 str += sprintf(str, "&#%d;", (int)*p);
6398 }
6399 p = collend;
6400 break;
6401 default:
6402 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6403 encoding, reason, startp, size, &exc,
6404 collstart-startp, collend-startp, &newpos);
6405 if (repunicode == NULL)
6406 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006407 if (PyBytes_Check(repunicode)) {
6408 /* Directly copy bytes result to output. */
6409 repsize = PyBytes_Size(repunicode);
6410 if (repsize > 1) {
6411 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006412 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006413 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6414 Py_DECREF(repunicode);
6415 goto onError;
6416 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006417 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006418 ressize += repsize-1;
6419 }
6420 memcpy(str, PyBytes_AsString(repunicode), repsize);
6421 str += repsize;
6422 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006423 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006424 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006425 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006426 /* need more space? (at least enough for what we
6427 have+the replacement+the rest of the string, so
6428 we won't have to check space for encodable characters) */
6429 respos = str - PyBytes_AS_STRING(res);
6430 repsize = PyUnicode_GET_SIZE(repunicode);
6431 requiredsize = respos+repsize+(endp-collend);
6432 if (requiredsize > ressize) {
6433 if (requiredsize<2*ressize)
6434 requiredsize = 2*ressize;
6435 if (_PyBytes_Resize(&res, requiredsize)) {
6436 Py_DECREF(repunicode);
6437 goto onError;
6438 }
6439 str = PyBytes_AS_STRING(res) + respos;
6440 ressize = requiredsize;
6441 }
6442 /* check if there is anything unencodable in the replacement
6443 and copy it to the output */
6444 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6445 c = *uni2;
6446 if (c >= limit) {
6447 raise_encode_exception(&exc, encoding, startp, size,
6448 unicodepos, unicodepos+1, reason);
6449 Py_DECREF(repunicode);
6450 goto onError;
6451 }
6452 *str = (char)c;
6453 }
6454 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006455 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006456 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006457 }
6458 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006459 /* Resize if we allocated to much */
6460 size = str - PyBytes_AS_STRING(res);
6461 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006462 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006463 if (_PyBytes_Resize(&res, size) < 0)
6464 goto onError;
6465 }
6466
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006467 Py_XDECREF(errorHandler);
6468 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006469 return res;
6470
6471 onError:
6472 Py_XDECREF(res);
6473 Py_XDECREF(errorHandler);
6474 Py_XDECREF(exc);
6475 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006476}
6477
Alexander Belopolsky40018472011-02-26 01:02:56 +00006478PyObject *
6479PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006480 Py_ssize_t size,
6481 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006483 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006484}
6485
Alexander Belopolsky40018472011-02-26 01:02:56 +00006486PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006487_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006488{
6489 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006490 PyErr_BadArgument();
6491 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006492 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006493 if (PyUnicode_READY(unicode) == -1)
6494 return NULL;
6495 /* Fast path: if it is a one-byte string, construct
6496 bytes object directly. */
6497 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6498 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6499 PyUnicode_GET_LENGTH(unicode));
6500 /* Non-Latin-1 characters present. Defer to above function to
6501 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006503 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006504 errors);
6505}
6506
6507PyObject*
6508PyUnicode_AsLatin1String(PyObject *unicode)
6509{
6510 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006511}
6512
/* --- 7-bit ASCII Codec -------------------------------------------------- */
6514
Alexander Belopolsky40018472011-02-26 01:02:56 +00006515PyObject *
6516PyUnicode_DecodeASCII(const char *s,
6517 Py_ssize_t size,
6518 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006519{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006520 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006521 PyUnicodeObject *v;
Victor Stinner702c7342011-10-05 13:50:52 +02006522 Py_UNICODE *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006523 Py_ssize_t startinpos;
6524 Py_ssize_t endinpos;
6525 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006526 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006527 int has_error;
6528 const unsigned char *p = (const unsigned char *)s;
6529 const unsigned char *end = p + size;
6530 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006531 PyObject *errorHandler = NULL;
6532 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006533
Guido van Rossumd57fd912000-03-10 22:53:23 +00006534 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006535 if (size == 1 && (unsigned char)s[0] < 128)
6536 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006537
Victor Stinner702c7342011-10-05 13:50:52 +02006538 has_error = 0;
6539 while (p < end && !has_error) {
6540 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6541 an explanation. */
6542 if (!((size_t) p & LONG_PTR_MASK)) {
6543 /* Help register allocation */
6544 register const unsigned char *_p = p;
6545 while (_p < aligned_end) {
6546 unsigned long value = *(unsigned long *) _p;
6547 if (value & ASCII_CHAR_MASK) {
6548 has_error = 1;
6549 break;
6550 }
6551 _p += SIZEOF_LONG;
6552 }
6553 if (_p == end)
6554 break;
6555 if (has_error)
6556 break;
6557 p = _p;
6558 }
6559 if (*p & 0x80) {
6560 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006561 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006562 }
6563 else {
6564 ++p;
6565 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006566 }
Victor Stinner702c7342011-10-05 13:50:52 +02006567 if (!has_error)
6568 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006569
Guido van Rossumd57fd912000-03-10 22:53:23 +00006570 v = _PyUnicode_New(size);
6571 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006572 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006573 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006574 return (PyObject *)v;
Victor Stinner702c7342011-10-05 13:50:52 +02006575 u = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006576 e = s + size;
6577 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006578 register unsigned char c = (unsigned char)*s;
6579 if (c < 128) {
Victor Stinner702c7342011-10-05 13:50:52 +02006580 *u++ = c;
Benjamin Peterson29060642009-01-31 22:14:21 +00006581 ++s;
6582 }
6583 else {
6584 startinpos = s-starts;
6585 endinpos = startinpos + 1;
Victor Stinner702c7342011-10-05 13:50:52 +02006586 outpos = u - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006587 if (unicode_decode_call_errorhandler(
6588 errors, &errorHandler,
6589 "ascii", "ordinal not in range(128)",
6590 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinner702c7342011-10-05 13:50:52 +02006591 &v, &outpos, &u))
Benjamin Peterson29060642009-01-31 22:14:21 +00006592 goto onError;
6593 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006594 }
Victor Stinner702c7342011-10-05 13:50:52 +02006595 if (u - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
6596 if (PyUnicode_Resize((PyObject**)&v, u - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006597 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006598 Py_XDECREF(errorHandler);
6599 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006600#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006601 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006602 Py_DECREF(v);
6603 return NULL;
6604 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006605#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006607
Benjamin Peterson29060642009-01-31 22:14:21 +00006608 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006610 Py_XDECREF(errorHandler);
6611 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612 return NULL;
6613}
6614
Alexander Belopolsky40018472011-02-26 01:02:56 +00006615PyObject *
6616PyUnicode_EncodeASCII(const Py_UNICODE *p,
6617 Py_ssize_t size,
6618 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006620 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006621}
6622
Alexander Belopolsky40018472011-02-26 01:02:56 +00006623PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006624_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006625{
6626 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006627 PyErr_BadArgument();
6628 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006629 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006630 if (PyUnicode_READY(unicode) == -1)
6631 return NULL;
6632 /* Fast path: if it is an ASCII-only string, construct bytes object
6633 directly. Else defer to above function to raise the exception. */
6634 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6635 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6636 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006638 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006639 errors);
6640}
6641
6642PyObject *
6643PyUnicode_AsASCIIString(PyObject *unicode)
6644{
6645 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646}
6647
Victor Stinner99b95382011-07-04 14:23:54 +02006648#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006649
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006650/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006651
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006652#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006653#define NEED_RETRY
6654#endif
6655
6656/* XXX This code is limited to "true" double-byte encodings, as
6657 a) it assumes an incomplete character consists of a single byte, and
6658 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006659 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006660
/* Return non-zero if the byte at s[offset] should be treated as a DBCS
   lead byte, 0 otherwise.

   A byte that IsDBCSLeadByte() accepts only counts when the position
   CharPrev() reports before it is the same position, is not itself a
   lead byte, or lies exactly two bytes back. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *pos = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*pos))
        return 0;

    before = CharPrev(s, pos);
    if (before == pos)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return (pos - before == 2);
}
6672
6673/*
6674 * Decode MBCS string into unicode object. If 'final' is set, converts
6675 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6676 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006677static int
6678decode_mbcs(PyUnicodeObject **v,
6679 const char *s, /* MBCS string */
6680 int size, /* sizeof MBCS string */
6681 int final,
6682 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006683{
6684 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006685 Py_ssize_t n;
6686 DWORD usize;
6687 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006688
6689 assert(size >= 0);
6690
Victor Stinner554f3f02010-06-16 23:33:54 +00006691 /* check and handle 'errors' arg */
6692 if (errors==NULL || strcmp(errors, "strict")==0)
6693 flags = MB_ERR_INVALID_CHARS;
6694 else if (strcmp(errors, "ignore")==0)
6695 flags = 0;
6696 else {
6697 PyErr_Format(PyExc_ValueError,
6698 "mbcs encoding does not support errors='%s'",
6699 errors);
6700 return -1;
6701 }
6702
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006703 /* Skip trailing lead-byte unless 'final' is set */
6704 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006705 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006706
6707 /* First get the size of the result */
6708 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006709 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6710 if (usize==0)
6711 goto mbcs_decode_error;
6712 } else
6713 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006714
6715 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006716 /* Create unicode object */
6717 *v = _PyUnicode_New(usize);
6718 if (*v == NULL)
6719 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006720 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006721 }
6722 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006723 /* Extend unicode object */
6724 n = PyUnicode_GET_SIZE(*v);
Victor Stinner2fd82272011-10-03 04:06:05 +02006725 if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006726 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006727 }
6728
6729 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006730 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006731 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006732 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6733 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006734 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006735 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006736 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006737
6738mbcs_decode_error:
6739 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6740 we raise a UnicodeDecodeError - else it is a 'generic'
6741 windows error
6742 */
6743 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6744 /* Ideally, we should get reason from FormatMessage - this
6745 is the Windows 2000 English version of the message
6746 */
6747 PyObject *exc = NULL;
6748 const char *reason = "No mapping for the Unicode character exists "
6749 "in the target multi-byte code page.";
6750 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6751 if (exc != NULL) {
6752 PyCodec_StrictErrors(exc);
6753 Py_DECREF(exc);
6754 }
6755 } else {
6756 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6757 }
6758 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006759}
6760
Alexander Belopolsky40018472011-02-26 01:02:56 +00006761PyObject *
6762PyUnicode_DecodeMBCSStateful(const char *s,
6763 Py_ssize_t size,
6764 const char *errors,
6765 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006766{
6767 PyUnicodeObject *v = NULL;
6768 int done;
6769
6770 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006771 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006772
6773#ifdef NEED_RETRY
6774 retry:
6775 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006776 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006777 else
6778#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006779 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006780
6781 if (done < 0) {
6782 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006783 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006784 }
6785
6786 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006787 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006788
6789#ifdef NEED_RETRY
6790 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006791 s += done;
6792 size -= done;
6793 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006794 }
6795#endif
Victor Stinner17efeed2011-10-04 20:05:46 +02006796#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006797 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006798 Py_DECREF(v);
6799 return NULL;
6800 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006801#endif
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006802 return (PyObject *)v;
6803}
6804
Alexander Belopolsky40018472011-02-26 01:02:56 +00006805PyObject *
6806PyUnicode_DecodeMBCS(const char *s,
6807 Py_ssize_t size,
6808 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006809{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006810 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6811}
6812
6813/*
6814 * Convert unicode into string object (MBCS).
6815 * Returns 0 if succeed, -1 otherwise.
6816 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006817static int
6818encode_mbcs(PyObject **repr,
6819 const Py_UNICODE *p, /* unicode */
6820 int size, /* size of unicode */
6821 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006822{
Victor Stinner554f3f02010-06-16 23:33:54 +00006823 BOOL usedDefaultChar = FALSE;
6824 BOOL *pusedDefaultChar;
6825 int mbcssize;
6826 Py_ssize_t n;
6827 PyObject *exc = NULL;
6828 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006829
6830 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006831
Victor Stinner554f3f02010-06-16 23:33:54 +00006832 /* check and handle 'errors' arg */
6833 if (errors==NULL || strcmp(errors, "strict")==0) {
6834 flags = WC_NO_BEST_FIT_CHARS;
6835 pusedDefaultChar = &usedDefaultChar;
6836 } else if (strcmp(errors, "replace")==0) {
6837 flags = 0;
6838 pusedDefaultChar = NULL;
6839 } else {
6840 PyErr_Format(PyExc_ValueError,
6841 "mbcs encoding does not support errors='%s'",
6842 errors);
6843 return -1;
6844 }
6845
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006846 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006847 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006848 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6849 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006850 if (mbcssize == 0) {
6851 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6852 return -1;
6853 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006854 /* If we used a default char, then we failed! */
6855 if (pusedDefaultChar && *pusedDefaultChar)
6856 goto mbcs_encode_error;
6857 } else {
6858 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006859 }
6860
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006861 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006862 /* Create string object */
6863 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6864 if (*repr == NULL)
6865 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006866 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006867 }
6868 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006869 /* Extend string object */
6870 n = PyBytes_Size(*repr);
6871 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6872 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006873 }
6874
6875 /* Do the conversion */
6876 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006877 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006878 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6879 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006880 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6881 return -1;
6882 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006883 if (pusedDefaultChar && *pusedDefaultChar)
6884 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006885 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006886 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006887
6888mbcs_encode_error:
6889 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6890 Py_XDECREF(exc);
6891 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006892}
6893
Alexander Belopolsky40018472011-02-26 01:02:56 +00006894PyObject *
6895PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6896 Py_ssize_t size,
6897 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006898{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006899 PyObject *repr = NULL;
6900 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006901
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006902#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006903 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006904 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006905 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006906 else
6907#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006908 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006909
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006910 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006911 Py_XDECREF(repr);
6912 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006913 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006914
6915#ifdef NEED_RETRY
6916 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006917 p += INT_MAX;
6918 size -= INT_MAX;
6919 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006920 }
6921#endif
6922
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006923 return repr;
6924}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006925
Alexander Belopolsky40018472011-02-26 01:02:56 +00006926PyObject *
6927PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006928{
6929 if (!PyUnicode_Check(unicode)) {
6930 PyErr_BadArgument();
6931 return NULL;
6932 }
6933 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006934 PyUnicode_GET_SIZE(unicode),
6935 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006936}
6937
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006938#undef NEED_RETRY
6939
Victor Stinner99b95382011-07-04 14:23:54 +02006940#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006941
Guido van Rossumd57fd912000-03-10 22:53:23 +00006942/* --- Character Mapping Codec -------------------------------------------- */
6943
Alexander Belopolsky40018472011-02-26 01:02:56 +00006944PyObject *
6945PyUnicode_DecodeCharmap(const char *s,
6946 Py_ssize_t size,
6947 PyObject *mapping,
6948 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006950 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006951 Py_ssize_t startinpos;
6952 Py_ssize_t endinpos;
6953 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006954 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006955 PyUnicodeObject *v;
6956 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006957 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006958 PyObject *errorHandler = NULL;
6959 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006960 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006961 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006962
Guido van Rossumd57fd912000-03-10 22:53:23 +00006963 /* Default to Latin-1 */
6964 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006965 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006966
6967 v = _PyUnicode_New(size);
6968 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006969 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006970 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006971 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006973 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006974 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006975 mapstring = PyUnicode_AS_UNICODE(mapping);
6976 maplen = PyUnicode_GET_SIZE(mapping);
6977 while (s < e) {
6978 unsigned char ch = *s;
6979 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006980
Benjamin Peterson29060642009-01-31 22:14:21 +00006981 if (ch < maplen)
6982 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006983
Benjamin Peterson29060642009-01-31 22:14:21 +00006984 if (x == 0xfffe) {
6985 /* undefined mapping */
6986 outpos = p-PyUnicode_AS_UNICODE(v);
6987 startinpos = s-starts;
6988 endinpos = startinpos+1;
6989 if (unicode_decode_call_errorhandler(
6990 errors, &errorHandler,
6991 "charmap", "character maps to <undefined>",
6992 &starts, &e, &startinpos, &endinpos, &exc, &s,
6993 &v, &outpos, &p)) {
6994 goto onError;
6995 }
6996 continue;
6997 }
6998 *p++ = x;
6999 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007000 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007001 }
7002 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007003 while (s < e) {
7004 unsigned char ch = *s;
7005 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007006
Benjamin Peterson29060642009-01-31 22:14:21 +00007007 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7008 w = PyLong_FromLong((long)ch);
7009 if (w == NULL)
7010 goto onError;
7011 x = PyObject_GetItem(mapping, w);
7012 Py_DECREF(w);
7013 if (x == NULL) {
7014 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7015 /* No mapping found means: mapping is undefined. */
7016 PyErr_Clear();
7017 x = Py_None;
7018 Py_INCREF(x);
7019 } else
7020 goto onError;
7021 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007022
Benjamin Peterson29060642009-01-31 22:14:21 +00007023 /* Apply mapping */
7024 if (PyLong_Check(x)) {
7025 long value = PyLong_AS_LONG(x);
7026 if (value < 0 || value > 65535) {
7027 PyErr_SetString(PyExc_TypeError,
7028 "character mapping must be in range(65536)");
7029 Py_DECREF(x);
7030 goto onError;
7031 }
7032 *p++ = (Py_UNICODE)value;
7033 }
7034 else if (x == Py_None) {
7035 /* undefined mapping */
7036 outpos = p-PyUnicode_AS_UNICODE(v);
7037 startinpos = s-starts;
7038 endinpos = startinpos+1;
7039 if (unicode_decode_call_errorhandler(
7040 errors, &errorHandler,
7041 "charmap", "character maps to <undefined>",
7042 &starts, &e, &startinpos, &endinpos, &exc, &s,
7043 &v, &outpos, &p)) {
7044 Py_DECREF(x);
7045 goto onError;
7046 }
7047 Py_DECREF(x);
7048 continue;
7049 }
7050 else if (PyUnicode_Check(x)) {
7051 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007052
Benjamin Peterson29060642009-01-31 22:14:21 +00007053 if (targetsize == 1)
7054 /* 1-1 mapping */
7055 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007056
Benjamin Peterson29060642009-01-31 22:14:21 +00007057 else if (targetsize > 1) {
7058 /* 1-n mapping */
7059 if (targetsize > extrachars) {
7060 /* resize first */
7061 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
7062 Py_ssize_t needed = (targetsize - extrachars) + \
7063 (targetsize << 2);
7064 extrachars += needed;
7065 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02007066 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00007067 PyUnicode_GET_SIZE(v) + needed) < 0) {
7068 Py_DECREF(x);
7069 goto onError;
7070 }
7071 p = PyUnicode_AS_UNICODE(v) + oldpos;
7072 }
7073 Py_UNICODE_COPY(p,
7074 PyUnicode_AS_UNICODE(x),
7075 targetsize);
7076 p += targetsize;
7077 extrachars -= targetsize;
7078 }
7079 /* 1-0 mapping: skip the character */
7080 }
7081 else {
7082 /* wrong return value */
7083 PyErr_SetString(PyExc_TypeError,
7084 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007085 Py_DECREF(x);
7086 goto onError;
7087 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007088 Py_DECREF(x);
7089 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007090 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007091 }
7092 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02007093 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007094 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007095 Py_XDECREF(errorHandler);
7096 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02007097#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007098 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007099 Py_DECREF(v);
7100 return NULL;
7101 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007102#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00007103 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00007104
Benjamin Peterson29060642009-01-31 22:14:21 +00007105 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007106 Py_XDECREF(errorHandler);
7107 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007108 Py_XDECREF(v);
7109 return NULL;
7110}
7111
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007112/* Charmap encoding: the lookup table */
7113
Alexander Belopolsky40018472011-02-26 01:02:56 +00007114struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007115 PyObject_HEAD
7116 unsigned char level1[32];
7117 int count2, count3;
7118 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007119};
7120
7121static PyObject*
7122encoding_map_size(PyObject *obj, PyObject* args)
7123{
7124 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007125 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007126 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007127}
7128
7129static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007130 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007131 PyDoc_STR("Return the size (in bytes) of this object") },
7132 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007133};
7134
7135static void
7136encoding_map_dealloc(PyObject* o)
7137{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007138 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007139}
7140
7141static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007142 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007143 "EncodingMap", /*tp_name*/
7144 sizeof(struct encoding_map), /*tp_basicsize*/
7145 0, /*tp_itemsize*/
7146 /* methods */
7147 encoding_map_dealloc, /*tp_dealloc*/
7148 0, /*tp_print*/
7149 0, /*tp_getattr*/
7150 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007151 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007152 0, /*tp_repr*/
7153 0, /*tp_as_number*/
7154 0, /*tp_as_sequence*/
7155 0, /*tp_as_mapping*/
7156 0, /*tp_hash*/
7157 0, /*tp_call*/
7158 0, /*tp_str*/
7159 0, /*tp_getattro*/
7160 0, /*tp_setattro*/
7161 0, /*tp_as_buffer*/
7162 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7163 0, /*tp_doc*/
7164 0, /*tp_traverse*/
7165 0, /*tp_clear*/
7166 0, /*tp_richcompare*/
7167 0, /*tp_weaklistoffset*/
7168 0, /*tp_iter*/
7169 0, /*tp_iternext*/
7170 encoding_map_methods, /*tp_methods*/
7171 0, /*tp_members*/
7172 0, /*tp_getset*/
7173 0, /*tp_base*/
7174 0, /*tp_dict*/
7175 0, /*tp_descr_get*/
7176 0, /*tp_descr_set*/
7177 0, /*tp_dictoffset*/
7178 0, /*tp_init*/
7179 0, /*tp_alloc*/
7180 0, /*tp_new*/
7181 0, /*tp_free*/
7182 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007183};
7184
7185PyObject*
7186PyUnicode_BuildEncodingMap(PyObject* string)
7187{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007188 PyObject *result;
7189 struct encoding_map *mresult;
7190 int i;
7191 int need_dict = 0;
7192 unsigned char level1[32];
7193 unsigned char level2[512];
7194 unsigned char *mlevel1, *mlevel2, *mlevel3;
7195 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007196 int kind;
7197 void *data;
7198 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007199
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007200 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007201 PyErr_BadArgument();
7202 return NULL;
7203 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007204 kind = PyUnicode_KIND(string);
7205 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007206 memset(level1, 0xFF, sizeof level1);
7207 memset(level2, 0xFF, sizeof level2);
7208
7209 /* If there isn't a one-to-one mapping of NULL to \0,
7210 or if there are non-BMP characters, we need to use
7211 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007212 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007213 need_dict = 1;
7214 for (i = 1; i < 256; i++) {
7215 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007216 ch = PyUnicode_READ(kind, data, i);
7217 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007218 need_dict = 1;
7219 break;
7220 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007221 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007222 /* unmapped character */
7223 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007224 l1 = ch >> 11;
7225 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007226 if (level1[l1] == 0xFF)
7227 level1[l1] = count2++;
7228 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007229 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007230 }
7231
7232 if (count2 >= 0xFF || count3 >= 0xFF)
7233 need_dict = 1;
7234
7235 if (need_dict) {
7236 PyObject *result = PyDict_New();
7237 PyObject *key, *value;
7238 if (!result)
7239 return NULL;
7240 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007241 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007242 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007243 if (!key || !value)
7244 goto failed1;
7245 if (PyDict_SetItem(result, key, value) == -1)
7246 goto failed1;
7247 Py_DECREF(key);
7248 Py_DECREF(value);
7249 }
7250 return result;
7251 failed1:
7252 Py_XDECREF(key);
7253 Py_XDECREF(value);
7254 Py_DECREF(result);
7255 return NULL;
7256 }
7257
7258 /* Create a three-level trie */
7259 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7260 16*count2 + 128*count3 - 1);
7261 if (!result)
7262 return PyErr_NoMemory();
7263 PyObject_Init(result, &EncodingMapType);
7264 mresult = (struct encoding_map*)result;
7265 mresult->count2 = count2;
7266 mresult->count3 = count3;
7267 mlevel1 = mresult->level1;
7268 mlevel2 = mresult->level23;
7269 mlevel3 = mresult->level23 + 16*count2;
7270 memcpy(mlevel1, level1, 32);
7271 memset(mlevel2, 0xFF, 16*count2);
7272 memset(mlevel3, 0, 128*count3);
7273 count3 = 0;
7274 for (i = 1; i < 256; i++) {
7275 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007276 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007277 /* unmapped character */
7278 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007279 o1 = PyUnicode_READ(kind, data, i)>>11;
7280 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007281 i2 = 16*mlevel1[o1] + o2;
7282 if (mlevel2[i2] == 0xFF)
7283 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007284 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007285 i3 = 128*mlevel2[i2] + o3;
7286 mlevel3[i3] = i;
7287 }
7288 return result;
7289}
7290
7291static int
7292encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7293{
7294 struct encoding_map *map = (struct encoding_map*)mapping;
7295 int l1 = c>>11;
7296 int l2 = (c>>7) & 0xF;
7297 int l3 = c & 0x7F;
7298 int i;
7299
7300#ifdef Py_UNICODE_WIDE
7301 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007302 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007303 }
7304#endif
7305 if (c == 0)
7306 return 0;
7307 /* level 1*/
7308 i = map->level1[l1];
7309 if (i == 0xFF) {
7310 return -1;
7311 }
7312 /* level 2*/
7313 i = map->level23[16*i+l2];
7314 if (i == 0xFF) {
7315 return -1;
7316 }
7317 /* level 3 */
7318 i = map->level23[16*map->count2 + 128*i + l3];
7319 if (i == 0) {
7320 return -1;
7321 }
7322 return i;
7323}
7324
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007325/* Lookup the character ch in the mapping. If the character
7326 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007327 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007328static PyObject *
7329charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007330{
Christian Heimes217cfd12007-12-02 14:31:20 +00007331 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007332 PyObject *x;
7333
7334 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007335 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007336 x = PyObject_GetItem(mapping, w);
7337 Py_DECREF(w);
7338 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007339 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7340 /* No mapping found means: mapping is undefined. */
7341 PyErr_Clear();
7342 x = Py_None;
7343 Py_INCREF(x);
7344 return x;
7345 } else
7346 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007347 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007348 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007349 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007350 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007351 long value = PyLong_AS_LONG(x);
7352 if (value < 0 || value > 255) {
7353 PyErr_SetString(PyExc_TypeError,
7354 "character mapping must be in range(256)");
7355 Py_DECREF(x);
7356 return NULL;
7357 }
7358 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007359 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007360 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007361 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007362 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007363 /* wrong return value */
7364 PyErr_Format(PyExc_TypeError,
7365 "character mapping must return integer, bytes or None, not %.400s",
7366 x->ob_type->tp_name);
7367 Py_DECREF(x);
7368 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007369 }
7370}
7371
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007372static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007373charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007374{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007375 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7376 /* exponentially overallocate to minimize reallocations */
7377 if (requiredsize < 2*outsize)
7378 requiredsize = 2*outsize;
7379 if (_PyBytes_Resize(outobj, requiredsize))
7380 return -1;
7381 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007382}
7383
/* Outcome of an attempt to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* the lookup or an output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007387/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007388 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007389 space is available. Return a new reference to the object that
7390 was put in the output buffer, or Py_None, if the mapping was undefined
7391 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007392 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007393static charmapencode_result
7394charmapencode_output(Py_UNICODE c, PyObject *mapping,
7395 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007396{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007397 PyObject *rep;
7398 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007399 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007400
Christian Heimes90aa7642007-12-19 02:45:37 +00007401 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007402 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007403 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007404 if (res == -1)
7405 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007406 if (outsize<requiredsize)
7407 if (charmapencode_resize(outobj, outpos, requiredsize))
7408 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007409 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007410 outstart[(*outpos)++] = (char)res;
7411 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007412 }
7413
7414 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007415 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007416 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007417 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007418 Py_DECREF(rep);
7419 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007420 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007421 if (PyLong_Check(rep)) {
7422 Py_ssize_t requiredsize = *outpos+1;
7423 if (outsize<requiredsize)
7424 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7425 Py_DECREF(rep);
7426 return enc_EXCEPTION;
7427 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007428 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007429 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007430 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007431 else {
7432 const char *repchars = PyBytes_AS_STRING(rep);
7433 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7434 Py_ssize_t requiredsize = *outpos+repsize;
7435 if (outsize<requiredsize)
7436 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7437 Py_DECREF(rep);
7438 return enc_EXCEPTION;
7439 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007440 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007441 memcpy(outstart + *outpos, repchars, repsize);
7442 *outpos += repsize;
7443 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007444 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007445 Py_DECREF(rep);
7446 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007447}
7448
7449/* handle an error in PyUnicode_EncodeCharmap
7450 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007451static int
7452charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00007453 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007454 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007455 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007456 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007457{
7458 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007459 Py_ssize_t repsize;
7460 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007461 Py_UNICODE *uni2;
7462 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007463 Py_ssize_t collstartpos = *inpos;
7464 Py_ssize_t collendpos = *inpos+1;
7465 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007466 char *encoding = "charmap";
7467 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007468 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007469
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007470 /* find all unencodable characters */
7471 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007472 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007473 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007474 int res = encoding_map_lookup(p[collendpos], mapping);
7475 if (res != -1)
7476 break;
7477 ++collendpos;
7478 continue;
7479 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007480
Benjamin Peterson29060642009-01-31 22:14:21 +00007481 rep = charmapencode_lookup(p[collendpos], mapping);
7482 if (rep==NULL)
7483 return -1;
7484 else if (rep!=Py_None) {
7485 Py_DECREF(rep);
7486 break;
7487 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007488 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007489 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007490 }
7491 /* cache callback name lookup
7492 * (if not done yet, i.e. it's the first error) */
7493 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007494 if ((errors==NULL) || (!strcmp(errors, "strict")))
7495 *known_errorHandler = 1;
7496 else if (!strcmp(errors, "replace"))
7497 *known_errorHandler = 2;
7498 else if (!strcmp(errors, "ignore"))
7499 *known_errorHandler = 3;
7500 else if (!strcmp(errors, "xmlcharrefreplace"))
7501 *known_errorHandler = 4;
7502 else
7503 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007504 }
7505 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007506 case 1: /* strict */
7507 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7508 return -1;
7509 case 2: /* replace */
7510 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007511 x = charmapencode_output('?', mapping, res, respos);
7512 if (x==enc_EXCEPTION) {
7513 return -1;
7514 }
7515 else if (x==enc_FAILED) {
7516 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7517 return -1;
7518 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007519 }
7520 /* fall through */
7521 case 3: /* ignore */
7522 *inpos = collendpos;
7523 break;
7524 case 4: /* xmlcharrefreplace */
7525 /* generate replacement (temporarily (mis)uses p) */
7526 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007527 char buffer[2+29+1+1];
7528 char *cp;
7529 sprintf(buffer, "&#%d;", (int)p[collpos]);
7530 for (cp = buffer; *cp; ++cp) {
7531 x = charmapencode_output(*cp, mapping, res, respos);
7532 if (x==enc_EXCEPTION)
7533 return -1;
7534 else if (x==enc_FAILED) {
7535 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7536 return -1;
7537 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007538 }
7539 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007540 *inpos = collendpos;
7541 break;
7542 default:
7543 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007544 encoding, reason, p, size, exceptionObject,
7545 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007546 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007547 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007548 if (PyBytes_Check(repunicode)) {
7549 /* Directly copy bytes result to output. */
7550 Py_ssize_t outsize = PyBytes_Size(*res);
7551 Py_ssize_t requiredsize;
7552 repsize = PyBytes_Size(repunicode);
7553 requiredsize = *respos + repsize;
7554 if (requiredsize > outsize)
7555 /* Make room for all additional bytes. */
7556 if (charmapencode_resize(res, respos, requiredsize)) {
7557 Py_DECREF(repunicode);
7558 return -1;
7559 }
7560 memcpy(PyBytes_AsString(*res) + *respos,
7561 PyBytes_AsString(repunicode), repsize);
7562 *respos += repsize;
7563 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007564 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007565 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007566 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007567 /* generate replacement */
7568 repsize = PyUnicode_GET_SIZE(repunicode);
7569 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007570 x = charmapencode_output(*uni2, mapping, res, respos);
7571 if (x==enc_EXCEPTION) {
7572 return -1;
7573 }
7574 else if (x==enc_FAILED) {
7575 Py_DECREF(repunicode);
7576 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7577 return -1;
7578 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007579 }
7580 *inpos = newpos;
7581 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007582 }
7583 return 0;
7584}
7585
Alexander Belopolsky40018472011-02-26 01:02:56 +00007586PyObject *
7587PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7588 Py_ssize_t size,
7589 PyObject *mapping,
7590 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007591{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007592 /* output object */
7593 PyObject *res = NULL;
7594 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007595 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007596 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007597 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007598 PyObject *errorHandler = NULL;
7599 PyObject *exc = NULL;
7600 /* the following variable is used for caching string comparisons
7601 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7602 * 3=ignore, 4=xmlcharrefreplace */
7603 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007604
7605 /* Default to Latin-1 */
7606 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007607 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007608
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007609 /* allocate enough for a simple encoding without
7610 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007611 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007612 if (res == NULL)
7613 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007614 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007615 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007616
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007617 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007618 /* try to encode it */
7619 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7620 if (x==enc_EXCEPTION) /* error */
7621 goto onError;
7622 if (x==enc_FAILED) { /* unencodable character */
7623 if (charmap_encoding_error(p, size, &inpos, mapping,
7624 &exc,
7625 &known_errorHandler, &errorHandler, errors,
7626 &res, &respos)) {
7627 goto onError;
7628 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007629 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007630 else
7631 /* done with this character => adjust input position */
7632 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007633 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007634
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007635 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007636 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007637 if (_PyBytes_Resize(&res, respos) < 0)
7638 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007639
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007640 Py_XDECREF(exc);
7641 Py_XDECREF(errorHandler);
7642 return res;
7643
Benjamin Peterson29060642009-01-31 22:14:21 +00007644 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007645 Py_XDECREF(res);
7646 Py_XDECREF(exc);
7647 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007648 return NULL;
7649}
7650
Alexander Belopolsky40018472011-02-26 01:02:56 +00007651PyObject *
7652PyUnicode_AsCharmapString(PyObject *unicode,
7653 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007654{
7655 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007656 PyErr_BadArgument();
7657 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007658 }
7659 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007660 PyUnicode_GET_SIZE(unicode),
7661 mapping,
7662 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007663}
7664
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007665/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007666static void
7667make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007668 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007669 Py_ssize_t startpos, Py_ssize_t endpos,
7670 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007671{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007672 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007673 *exceptionObject = _PyUnicodeTranslateError_Create(
7674 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007675 }
7676 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007677 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7678 goto onError;
7679 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7680 goto onError;
7681 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7682 goto onError;
7683 return;
7684 onError:
7685 Py_DECREF(*exceptionObject);
7686 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007687 }
7688}
7689
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007690/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007691static void
7692raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007693 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007694 Py_ssize_t startpos, Py_ssize_t endpos,
7695 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007696{
7697 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007698 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007699 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007700 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007701}
7702
7703/* error handling callback helper:
7704 build arguments, call the callback and check the arguments,
7705 put the result into newpos and return the replacement string, which
7706 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007707static PyObject *
7708unicode_translate_call_errorhandler(const char *errors,
7709 PyObject **errorHandler,
7710 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007711 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007712 Py_ssize_t startpos, Py_ssize_t endpos,
7713 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007714{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007715 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007716
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007717 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007718 PyObject *restuple;
7719 PyObject *resunicode;
7720
7721 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007722 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007723 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007724 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007725 }
7726
7727 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007728 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007729 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007730 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007731
7732 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007733 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007734 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007735 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007736 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007737 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007738 Py_DECREF(restuple);
7739 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007740 }
7741 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007742 &resunicode, &i_newpos)) {
7743 Py_DECREF(restuple);
7744 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007745 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007746 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007747 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007748 else
7749 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007750 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007751 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7752 Py_DECREF(restuple);
7753 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007754 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007755 Py_INCREF(resunicode);
7756 Py_DECREF(restuple);
7757 return resunicode;
7758}
7759
7760/* Lookup the character ch in the mapping and put the result in result,
7761 which must be decrefed by the caller.
7762 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007763static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007764charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007765{
Christian Heimes217cfd12007-12-02 14:31:20 +00007766 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007767 PyObject *x;
7768
7769 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007770 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007771 x = PyObject_GetItem(mapping, w);
7772 Py_DECREF(w);
7773 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007774 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7775 /* No mapping found means: use 1:1 mapping. */
7776 PyErr_Clear();
7777 *result = NULL;
7778 return 0;
7779 } else
7780 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007781 }
7782 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007783 *result = x;
7784 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007785 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007786 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007787 long value = PyLong_AS_LONG(x);
7788 long max = PyUnicode_GetMax();
7789 if (value < 0 || value > max) {
7790 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007791 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007792 Py_DECREF(x);
7793 return -1;
7794 }
7795 *result = x;
7796 return 0;
7797 }
7798 else if (PyUnicode_Check(x)) {
7799 *result = x;
7800 return 0;
7801 }
7802 else {
7803 /* wrong return value */
7804 PyErr_SetString(PyExc_TypeError,
7805 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007806 Py_DECREF(x);
7807 return -1;
7808 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007809}
7810/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007811 if not reallocate and adjust various state variables.
7812 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007813static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007814charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007815 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007816{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007817 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007818 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007819 /* exponentially overallocate to minimize reallocations */
7820 if (requiredsize < 2 * oldsize)
7821 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007822 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7823 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007824 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007825 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007826 }
7827 return 0;
7828}
7829/* lookup the character, put the result in the output string and adjust
7830 various state variables. Return a new reference to the object that
7831 was put in the output buffer in *result, or Py_None, if the mapping was
7832 undefined (in which case no character was written).
7833 The called must decref result.
7834 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007835static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007836charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7837 PyObject *mapping, Py_UCS4 **output,
7838 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007839 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007840{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007841 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7842 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007843 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007844 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007845 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007846 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007847 }
7848 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007849 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007850 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007851 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007852 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007853 }
7854 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007855 Py_ssize_t repsize;
7856 if (PyUnicode_READY(*res) == -1)
7857 return -1;
7858 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007859 if (repsize==1) {
7860 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007861 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007862 }
7863 else if (repsize!=0) {
7864 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007865 Py_ssize_t requiredsize = *opos +
7866 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007867 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007868 Py_ssize_t i;
7869 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007870 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007871 for(i = 0; i < repsize; i++)
7872 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007873 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007874 }
7875 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007876 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007877 return 0;
7878}
7879
Alexander Belopolsky40018472011-02-26 01:02:56 +00007880PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007881_PyUnicode_TranslateCharmap(PyObject *input,
7882 PyObject *mapping,
7883 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007884{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007885 /* input object */
7886 char *idata;
7887 Py_ssize_t size, i;
7888 int kind;
7889 /* output buffer */
7890 Py_UCS4 *output = NULL;
7891 Py_ssize_t osize;
7892 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007893 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007894 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007895 char *reason = "character maps to <undefined>";
7896 PyObject *errorHandler = NULL;
7897 PyObject *exc = NULL;
7898 /* the following variable is used for caching string comparisons
7899 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7900 * 3=ignore, 4=xmlcharrefreplace */
7901 int known_errorHandler = -1;
7902
Guido van Rossumd57fd912000-03-10 22:53:23 +00007903 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007904 PyErr_BadArgument();
7905 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007906 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007907
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007908 if (PyUnicode_READY(input) == -1)
7909 return NULL;
7910 idata = (char*)PyUnicode_DATA(input);
7911 kind = PyUnicode_KIND(input);
7912 size = PyUnicode_GET_LENGTH(input);
7913 i = 0;
7914
7915 if (size == 0) {
7916 Py_INCREF(input);
7917 return input;
7918 }
7919
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007920 /* allocate enough for a simple 1:1 translation without
7921 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007922 osize = size;
7923 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7924 opos = 0;
7925 if (output == NULL) {
7926 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007927 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007928 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007929
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007930 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007931 /* try to encode it */
7932 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007933 if (charmaptranslate_output(input, i, mapping,
7934 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007935 Py_XDECREF(x);
7936 goto onError;
7937 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007938 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007939 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007940 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007941 else { /* untranslatable character */
7942 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7943 Py_ssize_t repsize;
7944 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007945 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007946 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007947 Py_ssize_t collstart = i;
7948 Py_ssize_t collend = i+1;
7949 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007950
Benjamin Peterson29060642009-01-31 22:14:21 +00007951 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007952 while (collend < size) {
7953 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007954 goto onError;
7955 Py_XDECREF(x);
7956 if (x!=Py_None)
7957 break;
7958 ++collend;
7959 }
7960 /* cache callback name lookup
7961 * (if not done yet, i.e. it's the first error) */
7962 if (known_errorHandler==-1) {
7963 if ((errors==NULL) || (!strcmp(errors, "strict")))
7964 known_errorHandler = 1;
7965 else if (!strcmp(errors, "replace"))
7966 known_errorHandler = 2;
7967 else if (!strcmp(errors, "ignore"))
7968 known_errorHandler = 3;
7969 else if (!strcmp(errors, "xmlcharrefreplace"))
7970 known_errorHandler = 4;
7971 else
7972 known_errorHandler = 0;
7973 }
7974 switch (known_errorHandler) {
7975 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007976 raise_translate_exception(&exc, input, collstart,
7977 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007978 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007979 case 2: /* replace */
7980 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007981 for (coll = collstart; coll<collend; coll++)
7982 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007983 /* fall through */
7984 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007985 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007986 break;
7987 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007988 /* generate replacement (temporarily (mis)uses i) */
7989 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007990 char buffer[2+29+1+1];
7991 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007992 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7993 if (charmaptranslate_makespace(&output, &osize,
7994 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007995 goto onError;
7996 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007997 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007998 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007999 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008000 break;
8001 default:
8002 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008003 reason, input, &exc,
8004 collstart, collend, &newpos);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008005 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00008006 goto onError;
8007 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008008 repsize = PyUnicode_GET_LENGTH(repunicode);
8009 if (charmaptranslate_makespace(&output, &osize,
8010 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008011 Py_DECREF(repunicode);
8012 goto onError;
8013 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008014 for (uni2 = 0; repsize-->0; ++uni2)
8015 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8016 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008017 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008018 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008019 }
8020 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008021 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8022 if (!res)
8023 goto onError;
8024 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008025 Py_XDECREF(exc);
8026 Py_XDECREF(errorHandler);
8027 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008028
Benjamin Peterson29060642009-01-31 22:14:21 +00008029 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008030 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008031 Py_XDECREF(exc);
8032 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008033 return NULL;
8034}
8035
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008036/* Deprecated. Use PyUnicode_Translate instead. */
8037PyObject *
8038PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8039 Py_ssize_t size,
8040 PyObject *mapping,
8041 const char *errors)
8042{
8043 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8044 if (!unicode)
8045 return NULL;
8046 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8047}
8048
Alexander Belopolsky40018472011-02-26 01:02:56 +00008049PyObject *
8050PyUnicode_Translate(PyObject *str,
8051 PyObject *mapping,
8052 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008053{
8054 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008055
Guido van Rossumd57fd912000-03-10 22:53:23 +00008056 str = PyUnicode_FromObject(str);
8057 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008058 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008059 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008060 Py_DECREF(str);
8061 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008062
Benjamin Peterson29060642009-01-31 22:14:21 +00008063 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008064 Py_XDECREF(str);
8065 return NULL;
8066}
Tim Petersced69f82003-09-16 20:30:58 +00008067
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008068static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008069fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008070{
8071 /* No need to call PyUnicode_READY(self) because this function is only
8072 called as a callback from fixup() which does it already. */
8073 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8074 const int kind = PyUnicode_KIND(self);
8075 void *data = PyUnicode_DATA(self);
8076 Py_UCS4 maxchar = 0, ch, fixed;
8077 Py_ssize_t i;
8078
8079 for (i = 0; i < len; ++i) {
8080 ch = PyUnicode_READ(kind, data, i);
8081 fixed = 0;
8082 if (ch > 127) {
8083 if (Py_UNICODE_ISSPACE(ch))
8084 fixed = ' ';
8085 else {
8086 const int decimal = Py_UNICODE_TODECIMAL(ch);
8087 if (decimal >= 0)
8088 fixed = '0' + decimal;
8089 }
8090 if (fixed != 0) {
8091 if (fixed > maxchar)
8092 maxchar = fixed;
8093 PyUnicode_WRITE(kind, data, i, fixed);
8094 }
8095 else if (ch > maxchar)
8096 maxchar = ch;
8097 }
8098 else if (ch > maxchar)
8099 maxchar = ch;
8100 }
8101
8102 return maxchar;
8103}
8104
8105PyObject *
8106_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8107{
8108 if (!PyUnicode_Check(unicode)) {
8109 PyErr_BadInternalCall();
8110 return NULL;
8111 }
8112 if (PyUnicode_READY(unicode) == -1)
8113 return NULL;
8114 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8115 /* If the string is already ASCII, just return the same string */
8116 Py_INCREF(unicode);
8117 return unicode;
8118 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008119 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008120}
8121
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008122PyObject *
8123PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8124 Py_ssize_t length)
8125{
8126 PyObject *result;
8127 Py_UNICODE *p; /* write pointer into result */
8128 Py_ssize_t i;
8129 /* Copy to a new string */
8130 result = (PyObject *)_PyUnicode_New(length);
8131 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8132 if (result == NULL)
8133 return result;
8134 p = PyUnicode_AS_UNICODE(result);
8135 /* Iterate over code points */
8136 for (i = 0; i < length; i++) {
8137 Py_UNICODE ch =s[i];
8138 if (ch > 127) {
8139 int decimal = Py_UNICODE_TODECIMAL(ch);
8140 if (decimal >= 0)
8141 p[i] = '0' + decimal;
8142 }
8143 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008144#ifndef DONT_MAKE_RESULT_READY
8145 if (_PyUnicode_READY_REPLACE(&result)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008146 Py_DECREF(result);
8147 return NULL;
8148 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008149#endif
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008150 return result;
8151}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008152/* --- Decimal Encoder ---------------------------------------------------- */
8153
Alexander Belopolsky40018472011-02-26 01:02:56 +00008154int
8155PyUnicode_EncodeDecimal(Py_UNICODE *s,
8156 Py_ssize_t length,
8157 char *output,
8158 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008159{
8160 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008161 PyObject *errorHandler = NULL;
8162 PyObject *exc = NULL;
8163 const char *encoding = "decimal";
8164 const char *reason = "invalid decimal Unicode string";
8165 /* the following variable is used for caching string comparisons
8166 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8167 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008168
8169 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008170 PyErr_BadArgument();
8171 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008172 }
8173
8174 p = s;
8175 end = s + length;
8176 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008177 register Py_UNICODE ch = *p;
8178 int decimal;
8179 PyObject *repunicode;
8180 Py_ssize_t repsize;
8181 Py_ssize_t newpos;
8182 Py_UNICODE *uni2;
8183 Py_UNICODE *collstart;
8184 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00008185
Benjamin Peterson29060642009-01-31 22:14:21 +00008186 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008187 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008188 ++p;
8189 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008190 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008191 decimal = Py_UNICODE_TODECIMAL(ch);
8192 if (decimal >= 0) {
8193 *output++ = '0' + decimal;
8194 ++p;
8195 continue;
8196 }
8197 if (0 < ch && ch < 256) {
8198 *output++ = (char)ch;
8199 ++p;
8200 continue;
8201 }
8202 /* All other characters are considered unencodable */
8203 collstart = p;
8204 collend = p+1;
8205 while (collend < end) {
8206 if ((0 < *collend && *collend < 256) ||
8207 !Py_UNICODE_ISSPACE(*collend) ||
8208 Py_UNICODE_TODECIMAL(*collend))
8209 break;
8210 }
8211 /* cache callback name lookup
8212 * (if not done yet, i.e. it's the first error) */
8213 if (known_errorHandler==-1) {
8214 if ((errors==NULL) || (!strcmp(errors, "strict")))
8215 known_errorHandler = 1;
8216 else if (!strcmp(errors, "replace"))
8217 known_errorHandler = 2;
8218 else if (!strcmp(errors, "ignore"))
8219 known_errorHandler = 3;
8220 else if (!strcmp(errors, "xmlcharrefreplace"))
8221 known_errorHandler = 4;
8222 else
8223 known_errorHandler = 0;
8224 }
8225 switch (known_errorHandler) {
8226 case 1: /* strict */
8227 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8228 goto onError;
8229 case 2: /* replace */
8230 for (p = collstart; p < collend; ++p)
8231 *output++ = '?';
8232 /* fall through */
8233 case 3: /* ignore */
8234 p = collend;
8235 break;
8236 case 4: /* xmlcharrefreplace */
8237 /* generate replacement (temporarily (mis)uses p) */
8238 for (p = collstart; p < collend; ++p)
8239 output += sprintf(output, "&#%d;", (int)*p);
8240 p = collend;
8241 break;
8242 default:
8243 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8244 encoding, reason, s, length, &exc,
8245 collstart-s, collend-s, &newpos);
8246 if (repunicode == NULL)
8247 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008248 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008249 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008250 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8251 Py_DECREF(repunicode);
8252 goto onError;
8253 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008254 /* generate replacement */
8255 repsize = PyUnicode_GET_SIZE(repunicode);
8256 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8257 Py_UNICODE ch = *uni2;
8258 if (Py_UNICODE_ISSPACE(ch))
8259 *output++ = ' ';
8260 else {
8261 decimal = Py_UNICODE_TODECIMAL(ch);
8262 if (decimal >= 0)
8263 *output++ = '0' + decimal;
8264 else if (0 < ch && ch < 256)
8265 *output++ = (char)ch;
8266 else {
8267 Py_DECREF(repunicode);
8268 raise_encode_exception(&exc, encoding,
8269 s, length, collstart-s, collend-s, reason);
8270 goto onError;
8271 }
8272 }
8273 }
8274 p = s + newpos;
8275 Py_DECREF(repunicode);
8276 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008277 }
8278 /* 0-terminate the output string */
8279 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008280 Py_XDECREF(exc);
8281 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008282 return 0;
8283
Benjamin Peterson29060642009-01-31 22:14:21 +00008284 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008285 Py_XDECREF(exc);
8286 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008287 return -1;
8288}
8289
Guido van Rossumd57fd912000-03-10 22:53:23 +00008290/* --- Helpers ------------------------------------------------------------ */
8291
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008292#include "stringlib/ucs1lib.h"
8293#include "stringlib/fastsearch.h"
8294#include "stringlib/partition.h"
8295#include "stringlib/split.h"
8296#include "stringlib/count.h"
8297#include "stringlib/find.h"
8298#include "stringlib/localeutil.h"
8299#include "stringlib/undef.h"
8300
8301#include "stringlib/ucs2lib.h"
8302#include "stringlib/fastsearch.h"
8303#include "stringlib/partition.h"
8304#include "stringlib/split.h"
8305#include "stringlib/count.h"
8306#include "stringlib/find.h"
8307#include "stringlib/localeutil.h"
8308#include "stringlib/undef.h"
8309
8310#include "stringlib/ucs4lib.h"
8311#include "stringlib/fastsearch.h"
8312#include "stringlib/partition.h"
8313#include "stringlib/split.h"
8314#include "stringlib/count.h"
8315#include "stringlib/find.h"
8316#include "stringlib/localeutil.h"
8317#include "stringlib/undef.h"
8318
8319static Py_ssize_t
8320any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
8321 const Py_UCS1*, Py_ssize_t,
8322 Py_ssize_t, Py_ssize_t),
8323 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
8324 const Py_UCS2*, Py_ssize_t,
8325 Py_ssize_t, Py_ssize_t),
8326 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
8327 const Py_UCS4*, Py_ssize_t,
8328 Py_ssize_t, Py_ssize_t),
8329 PyObject* s1, PyObject* s2,
8330 Py_ssize_t start,
8331 Py_ssize_t end)
8332{
8333 int kind1, kind2, kind;
8334 void *buf1, *buf2;
8335 Py_ssize_t len1, len2, result;
8336
8337 kind1 = PyUnicode_KIND(s1);
8338 kind2 = PyUnicode_KIND(s2);
8339 kind = kind1 > kind2 ? kind1 : kind2;
8340 buf1 = PyUnicode_DATA(s1);
8341 buf2 = PyUnicode_DATA(s2);
8342 if (kind1 != kind)
8343 buf1 = _PyUnicode_AsKind(s1, kind);
8344 if (!buf1)
8345 return -2;
8346 if (kind2 != kind)
8347 buf2 = _PyUnicode_AsKind(s2, kind);
8348 if (!buf2) {
8349 if (kind1 != kind) PyMem_Free(buf1);
8350 return -2;
8351 }
8352 len1 = PyUnicode_GET_LENGTH(s1);
8353 len2 = PyUnicode_GET_LENGTH(s2);
8354
8355 switch(kind) {
8356 case PyUnicode_1BYTE_KIND:
8357 result = ucs1(buf1, len1, buf2, len2, start, end);
8358 break;
8359 case PyUnicode_2BYTE_KIND:
8360 result = ucs2(buf1, len1, buf2, len2, start, end);
8361 break;
8362 case PyUnicode_4BYTE_KIND:
8363 result = ucs4(buf1, len1, buf2, len2, start, end);
8364 break;
8365 default:
8366 assert(0); result = -2;
8367 }
8368
8369 if (kind1 != kind)
8370 PyMem_Free(buf1);
8371 if (kind2 != kind)
8372 PyMem_Free(buf2);
8373
8374 return result;
8375}
8376
8377Py_ssize_t
8378_PyUnicode_InsertThousandsGrouping(int kind, void *data,
8379 Py_ssize_t n_buffer,
8380 void *digits, Py_ssize_t n_digits,
8381 Py_ssize_t min_width,
8382 const char *grouping,
8383 const char *thousands_sep)
8384{
8385 switch(kind) {
8386 case PyUnicode_1BYTE_KIND:
8387 return _PyUnicode_ucs1_InsertThousandsGrouping(
8388 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8389 min_width, grouping, thousands_sep);
8390 case PyUnicode_2BYTE_KIND:
8391 return _PyUnicode_ucs2_InsertThousandsGrouping(
8392 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8393 min_width, grouping, thousands_sep);
8394 case PyUnicode_4BYTE_KIND:
8395 return _PyUnicode_ucs4_InsertThousandsGrouping(
8396 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8397 min_width, grouping, thousands_sep);
8398 }
8399 assert(0);
8400 return -1;
8401}
8402
8403
Eric Smith8c663262007-08-25 02:26:07 +00008404#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00008405#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008406
Thomas Wouters477c8d52006-05-27 19:21:47 +00008407#include "stringlib/count.h"
8408#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00008409
/* Helper macro to fix up start/end slice values in place.

   end is clamped to len from above; a negative end or start is taken
   relative to len and then clamped to 0.  start is not clamped from above.
   start and end must be modifiable lvalues.  len is evaluated more than
   once, so it must not have side effects.

   The body is wrapped in do { ... } while (0) so that the macro expands to
   exactly one statement and stays correct inside an unbraced if/else; use it
   as a statement followed by ';'. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008424
Alexander Belopolsky40018472011-02-26 01:02:56 +00008425Py_ssize_t
8426PyUnicode_Count(PyObject *str,
8427 PyObject *substr,
8428 Py_ssize_t start,
8429 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008430{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008431 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008432 PyUnicodeObject* str_obj;
8433 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008434 int kind1, kind2, kind;
8435 void *buf1 = NULL, *buf2 = NULL;
8436 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008437
Thomas Wouters477c8d52006-05-27 19:21:47 +00008438 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008439 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008440 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008441 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008442 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008443 Py_DECREF(str_obj);
8444 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008445 }
Tim Petersced69f82003-09-16 20:30:58 +00008446
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008447 kind1 = PyUnicode_KIND(str_obj);
8448 kind2 = PyUnicode_KIND(sub_obj);
8449 kind = kind1 > kind2 ? kind1 : kind2;
8450 buf1 = PyUnicode_DATA(str_obj);
8451 if (kind1 != kind)
8452 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8453 if (!buf1)
8454 goto onError;
8455 buf2 = PyUnicode_DATA(sub_obj);
8456 if (kind2 != kind)
8457 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8458 if (!buf2)
8459 goto onError;
8460 len1 = PyUnicode_GET_LENGTH(str_obj);
8461 len2 = PyUnicode_GET_LENGTH(sub_obj);
8462
8463 ADJUST_INDICES(start, end, len1);
8464 switch(kind) {
8465 case PyUnicode_1BYTE_KIND:
8466 result = ucs1lib_count(
8467 ((Py_UCS1*)buf1) + start, end - start,
8468 buf2, len2, PY_SSIZE_T_MAX
8469 );
8470 break;
8471 case PyUnicode_2BYTE_KIND:
8472 result = ucs2lib_count(
8473 ((Py_UCS2*)buf1) + start, end - start,
8474 buf2, len2, PY_SSIZE_T_MAX
8475 );
8476 break;
8477 case PyUnicode_4BYTE_KIND:
8478 result = ucs4lib_count(
8479 ((Py_UCS4*)buf1) + start, end - start,
8480 buf2, len2, PY_SSIZE_T_MAX
8481 );
8482 break;
8483 default:
8484 assert(0); result = 0;
8485 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008486
8487 Py_DECREF(sub_obj);
8488 Py_DECREF(str_obj);
8489
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008490 if (kind1 != kind)
8491 PyMem_Free(buf1);
8492 if (kind2 != kind)
8493 PyMem_Free(buf2);
8494
Guido van Rossumd57fd912000-03-10 22:53:23 +00008495 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008496 onError:
8497 Py_DECREF(sub_obj);
8498 Py_DECREF(str_obj);
8499 if (kind1 != kind && buf1)
8500 PyMem_Free(buf1);
8501 if (kind2 != kind && buf2)
8502 PyMem_Free(buf2);
8503 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008504}
8505
Alexander Belopolsky40018472011-02-26 01:02:56 +00008506Py_ssize_t
8507PyUnicode_Find(PyObject *str,
8508 PyObject *sub,
8509 Py_ssize_t start,
8510 Py_ssize_t end,
8511 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008512{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008513 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008514
Guido van Rossumd57fd912000-03-10 22:53:23 +00008515 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008516 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008517 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008518 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008519 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008520 Py_DECREF(str);
8521 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008522 }
Tim Petersced69f82003-09-16 20:30:58 +00008523
Thomas Wouters477c8d52006-05-27 19:21:47 +00008524 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008525 result = any_find_slice(
8526 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
8527 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008528 );
8529 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008530 result = any_find_slice(
8531 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8532 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008533 );
8534
Guido van Rossumd57fd912000-03-10 22:53:23 +00008535 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008536 Py_DECREF(sub);
8537
Guido van Rossumd57fd912000-03-10 22:53:23 +00008538 return result;
8539}
8540
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008541Py_ssize_t
8542PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8543 Py_ssize_t start, Py_ssize_t end,
8544 int direction)
8545{
8546 char *result;
8547 int kind;
8548 if (PyUnicode_READY(str) == -1)
8549 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008550 if (start < 0 || end < 0) {
8551 PyErr_SetString(PyExc_IndexError, "string index out of range");
8552 return -2;
8553 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008554 if (end > PyUnicode_GET_LENGTH(str))
8555 end = PyUnicode_GET_LENGTH(str);
8556 kind = PyUnicode_KIND(str);
8557 result = findchar(PyUnicode_1BYTE_DATA(str)
8558 + PyUnicode_KIND_SIZE(kind, start),
8559 kind,
8560 end-start, ch, direction);
8561 if (!result)
8562 return -1;
8563 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8564}
8565
Alexander Belopolsky40018472011-02-26 01:02:56 +00008566static int
8567tailmatch(PyUnicodeObject *self,
8568 PyUnicodeObject *substring,
8569 Py_ssize_t start,
8570 Py_ssize_t end,
8571 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008572{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008573 int kind_self;
8574 int kind_sub;
8575 void *data_self;
8576 void *data_sub;
8577 Py_ssize_t offset;
8578 Py_ssize_t i;
8579 Py_ssize_t end_sub;
8580
8581 if (PyUnicode_READY(self) == -1 ||
8582 PyUnicode_READY(substring) == -1)
8583 return 0;
8584
8585 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008586 return 1;
8587
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008588 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8589 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008590 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008591 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008592
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008593 kind_self = PyUnicode_KIND(self);
8594 data_self = PyUnicode_DATA(self);
8595 kind_sub = PyUnicode_KIND(substring);
8596 data_sub = PyUnicode_DATA(substring);
8597 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8598
8599 if (direction > 0)
8600 offset = end;
8601 else
8602 offset = start;
8603
8604 if (PyUnicode_READ(kind_self, data_self, offset) ==
8605 PyUnicode_READ(kind_sub, data_sub, 0) &&
8606 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8607 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8608 /* If both are of the same kind, memcmp is sufficient */
8609 if (kind_self == kind_sub) {
8610 return ! memcmp((char *)data_self +
8611 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8612 data_sub,
8613 PyUnicode_GET_LENGTH(substring) *
8614 PyUnicode_CHARACTER_SIZE(substring));
8615 }
8616 /* otherwise we have to compare each character by first accesing it */
8617 else {
8618 /* We do not need to compare 0 and len(substring)-1 because
8619 the if statement above ensured already that they are equal
8620 when we end up here. */
8621 // TODO: honor direction and do a forward or backwards search
8622 for (i = 1; i < end_sub; ++i) {
8623 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8624 PyUnicode_READ(kind_sub, data_sub, i))
8625 return 0;
8626 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008627 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008628 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008629 }
8630
8631 return 0;
8632}
8633
Alexander Belopolsky40018472011-02-26 01:02:56 +00008634Py_ssize_t
8635PyUnicode_Tailmatch(PyObject *str,
8636 PyObject *substr,
8637 Py_ssize_t start,
8638 Py_ssize_t end,
8639 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008640{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008641 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008642
Guido van Rossumd57fd912000-03-10 22:53:23 +00008643 str = PyUnicode_FromObject(str);
8644 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008645 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008646 substr = PyUnicode_FromObject(substr);
8647 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008648 Py_DECREF(str);
8649 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008650 }
Tim Petersced69f82003-09-16 20:30:58 +00008651
Guido van Rossumd57fd912000-03-10 22:53:23 +00008652 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008653 (PyUnicodeObject *)substr,
8654 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008655 Py_DECREF(str);
8656 Py_DECREF(substr);
8657 return result;
8658}
8659
Guido van Rossumd57fd912000-03-10 22:53:23 +00008660/* Apply fixfct filter to the Unicode object self and return a
8661 reference to the modified object */
8662
Alexander Belopolsky40018472011-02-26 01:02:56 +00008663static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02008664fixup(PyObject *self,
8665 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008666{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008667 PyObject *u;
8668 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008669
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008670 if (PyUnicode_READY(self) == -1)
8671 return NULL;
8672 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8673 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8674 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008675 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008676 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008677
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008678 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8679 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008680
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008681 /* fix functions return the new maximum character in a string,
8682 if the kind of the resulting unicode object does not change,
8683 everything is fine. Otherwise we need to change the string kind
8684 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02008685 maxchar_new = fixfct(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008686 if (maxchar_new == 0)
8687 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8688 else if (maxchar_new <= 127)
8689 maxchar_new = 127;
8690 else if (maxchar_new <= 255)
8691 maxchar_new = 255;
8692 else if (maxchar_new <= 65535)
8693 maxchar_new = 65535;
8694 else
8695 maxchar_new = 1114111; /* 0x10ffff */
8696
8697 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008698 /* fixfct should return TRUE if it modified the buffer. If
8699 FALSE, return a reference to the original buffer instead
8700 (to save space, not time) */
8701 Py_INCREF(self);
8702 Py_DECREF(u);
8703 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008704 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008705 else if (maxchar_new == maxchar_old) {
8706 return u;
8707 }
8708 else {
8709 /* In case the maximum character changed, we need to
8710 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008711 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008712 if (v == NULL) {
8713 Py_DECREF(u);
8714 return NULL;
8715 }
8716 if (maxchar_new > maxchar_old) {
8717 /* If the maxchar increased so that the kind changed, not all
8718 characters are representable anymore and we need to fix the
8719 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008720 if (PyUnicode_CopyCharacters(v, 0,
8721 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008722 PyUnicode_GET_LENGTH(self)) < 0)
8723 {
8724 Py_DECREF(u);
8725 return NULL;
8726 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008727 maxchar_old = fixfct(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008728 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8729 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008730 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008731 if (PyUnicode_CopyCharacters(v, 0,
8732 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008733 PyUnicode_GET_LENGTH(self)) < 0)
8734 {
8735 Py_DECREF(u);
8736 return NULL;
8737 }
8738 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008739
8740 Py_DECREF(u);
8741 return v;
8742 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008743}
8744
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008745static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008746fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008747{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008748 /* No need to call PyUnicode_READY(self) because this function is only
8749 called as a callback from fixup() which does it already. */
8750 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8751 const int kind = PyUnicode_KIND(self);
8752 void *data = PyUnicode_DATA(self);
8753 int touched = 0;
8754 Py_UCS4 maxchar = 0;
8755 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008756
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008757 for (i = 0; i < len; ++i) {
8758 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8759 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8760 if (up != ch) {
8761 if (up > maxchar)
8762 maxchar = up;
8763 PyUnicode_WRITE(kind, data, i, up);
8764 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008765 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008766 else if (ch > maxchar)
8767 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008768 }
8769
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008770 if (touched)
8771 return maxchar;
8772 else
8773 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008774}
8775
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008776static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008777fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008778{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008779 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8780 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8781 const int kind = PyUnicode_KIND(self);
8782 void *data = PyUnicode_DATA(self);
8783 int touched = 0;
8784 Py_UCS4 maxchar = 0;
8785 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008786
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008787 for(i = 0; i < len; ++i) {
8788 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8789 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8790 if (lo != ch) {
8791 if (lo > maxchar)
8792 maxchar = lo;
8793 PyUnicode_WRITE(kind, data, i, lo);
8794 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008795 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008796 else if (ch > maxchar)
8797 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008798 }
8799
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008800 if (touched)
8801 return maxchar;
8802 else
8803 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008804}
8805
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008806static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008807fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008808{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008809 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8810 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8811 const int kind = PyUnicode_KIND(self);
8812 void *data = PyUnicode_DATA(self);
8813 int touched = 0;
8814 Py_UCS4 maxchar = 0;
8815 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008816
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008817 for(i = 0; i < len; ++i) {
8818 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8819 Py_UCS4 nu = 0;
8820
8821 if (Py_UNICODE_ISUPPER(ch))
8822 nu = Py_UNICODE_TOLOWER(ch);
8823 else if (Py_UNICODE_ISLOWER(ch))
8824 nu = Py_UNICODE_TOUPPER(ch);
8825
8826 if (nu != 0) {
8827 if (nu > maxchar)
8828 maxchar = nu;
8829 PyUnicode_WRITE(kind, data, i, nu);
8830 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008831 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008832 else if (ch > maxchar)
8833 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008834 }
8835
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008836 if (touched)
8837 return maxchar;
8838 else
8839 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008840}
8841
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008842static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008843fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008844{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008845 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8846 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8847 const int kind = PyUnicode_KIND(self);
8848 void *data = PyUnicode_DATA(self);
8849 int touched = 0;
8850 Py_UCS4 maxchar = 0;
8851 Py_ssize_t i = 0;
8852 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008853
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008854 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008855 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008856
8857 ch = PyUnicode_READ(kind, data, i);
8858 if (!Py_UNICODE_ISUPPER(ch)) {
8859 maxchar = Py_UNICODE_TOUPPER(ch);
8860 PyUnicode_WRITE(kind, data, i, maxchar);
8861 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008862 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008863 ++i;
8864 for(; i < len; ++i) {
8865 ch = PyUnicode_READ(kind, data, i);
8866 if (!Py_UNICODE_ISLOWER(ch)) {
8867 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8868 if (lo > maxchar)
8869 maxchar = lo;
8870 PyUnicode_WRITE(kind, data, i, lo);
8871 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008872 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008873 else if (ch > maxchar)
8874 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008875 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008876
8877 if (touched)
8878 return maxchar;
8879 else
8880 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008881}
8882
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008883static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008884fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008885{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008886 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8887 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8888 const int kind = PyUnicode_KIND(self);
8889 void *data = PyUnicode_DATA(self);
8890 Py_UCS4 maxchar = 0;
8891 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008892 int previous_is_cased;
8893
8894 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008895 if (len == 1) {
8896 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8897 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8898 if (ti != ch) {
8899 PyUnicode_WRITE(kind, data, i, ti);
8900 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008901 }
8902 else
8903 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008904 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008905 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008906 for(; i < len; ++i) {
8907 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8908 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008909
Benjamin Peterson29060642009-01-31 22:14:21 +00008910 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008911 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008912 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008913 nu = Py_UNICODE_TOTITLE(ch);
8914
8915 if (nu > maxchar)
8916 maxchar = nu;
8917 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008918
Benjamin Peterson29060642009-01-31 22:14:21 +00008919 if (Py_UNICODE_ISLOWER(ch) ||
8920 Py_UNICODE_ISUPPER(ch) ||
8921 Py_UNICODE_ISTITLE(ch))
8922 previous_is_cased = 1;
8923 else
8924 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008925 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008926 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008927}
8928
Tim Peters8ce9f162004-08-27 01:49:32 +00008929PyObject *
8930PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008931{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008932 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008933 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008934 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008935 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008936 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8937 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008938 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008939 Py_ssize_t sz, i, res_offset;
8940 Py_UCS4 maxchar = 0;
8941 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008942
Tim Peters05eba1f2004-08-27 21:32:02 +00008943 fseq = PySequence_Fast(seq, "");
8944 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008945 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008946 }
8947
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008948 /* NOTE: the following code can't call back into Python code,
8949 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008950 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008951
Tim Peters05eba1f2004-08-27 21:32:02 +00008952 seqlen = PySequence_Fast_GET_SIZE(fseq);
8953 /* If empty sequence, return u"". */
8954 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008955 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008956 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008957 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008958 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008959 /* If singleton sequence with an exact Unicode, return that. */
8960 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008961 item = items[0];
8962 if (PyUnicode_CheckExact(item)) {
8963 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008964 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008965 goto Done;
8966 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008967 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008968 else {
8969 /* Set up sep and seplen */
8970 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008971 /* fall back to a blank space separator */
8972 sep = PyUnicode_FromOrdinal(' ');
Victor Stinnere9a29352011-10-01 02:14:59 +02008973 if (!sep)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008974 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008975 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008976 else {
8977 if (!PyUnicode_Check(separator)) {
8978 PyErr_Format(PyExc_TypeError,
8979 "separator: expected str instance,"
8980 " %.80s found",
8981 Py_TYPE(separator)->tp_name);
8982 goto onError;
8983 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008984 if (PyUnicode_READY(separator))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008985 goto onError;
8986 sep = separator;
8987 seplen = PyUnicode_GET_LENGTH(separator);
8988 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
Georg Brandl7597add2011-10-05 16:36:47 +02008989 /* inc refcount to keep this code path symmetric with the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008990 above case of a blank separator */
8991 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008992 }
8993 }
8994
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008995 /* There are at least two things to join, or else we have a subclass
8996 * of str in the sequence.
8997 * Do a pre-pass to figure out the total amount of space we'll
8998 * need (sz), and see whether all argument are strings.
8999 */
9000 sz = 0;
9001 for (i = 0; i < seqlen; i++) {
9002 const Py_ssize_t old_sz = sz;
9003 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009004 if (!PyUnicode_Check(item)) {
9005 PyErr_Format(PyExc_TypeError,
9006 "sequence item %zd: expected str instance,"
9007 " %.80s found",
9008 i, Py_TYPE(item)->tp_name);
9009 goto onError;
9010 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009011 if (PyUnicode_READY(item) == -1)
9012 goto onError;
9013 sz += PyUnicode_GET_LENGTH(item);
9014 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
9015 if (item_maxchar > maxchar)
9016 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009017 if (i != 0)
9018 sz += seplen;
9019 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9020 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009021 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009022 goto onError;
9023 }
9024 }
Tim Petersced69f82003-09-16 20:30:58 +00009025
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009026 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009027 if (res == NULL)
9028 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009029
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009030 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009031 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinner9ce5a832011-10-03 23:36:02 +02009032 Py_ssize_t itemlen, copied;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009033 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009034 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009035 if (i && seplen != 0) {
9036 copied = PyUnicode_CopyCharacters(res, res_offset,
9037 sep, 0, seplen);
9038 if (copied < 0)
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009039 goto onError;
Victor Stinner9ce5a832011-10-03 23:36:02 +02009040#ifdef Py_DEBUG
9041 res_offset += copied;
9042#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009043 res_offset += seplen;
Victor Stinner9ce5a832011-10-03 23:36:02 +02009044#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00009045 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009046 itemlen = PyUnicode_GET_LENGTH(item);
9047 if (itemlen != 0) {
9048 copied = PyUnicode_CopyCharacters(res, res_offset,
9049 item, 0, itemlen);
9050 if (copied < 0)
9051 goto onError;
9052#ifdef Py_DEBUG
9053 res_offset += copied;
9054#else
9055 res_offset += itemlen;
9056#endif
9057 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009058 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009059 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009060
Benjamin Peterson29060642009-01-31 22:14:21 +00009061 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00009062 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009063 Py_XDECREF(sep);
9064 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009065
Benjamin Peterson29060642009-01-31 22:14:21 +00009066 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009067 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009068 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009069 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009070 return NULL;
9071}
9072
/* FILL(kind, data, value, start, length): store `value` into `length`
   consecutive characters of the buffer `data`, beginning at character
   index `start`.  `kind` selects the character width; it must not be
   PyUnicode_WCHAR_KIND.  Every argument is parenthesized in the
   expansion so that expression arguments (e.g. `a + b`) behave as
   expected; `value` and `length` may be evaluated more than once. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        } \
    } while (0)
9095
Victor Stinner9310abb2011-10-05 00:59:23 +02009096static PyObject *
9097pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009098 Py_ssize_t left,
9099 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009100 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009101{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009102 PyObject *u;
9103 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009104 int kind;
9105 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009106
9107 if (left < 0)
9108 left = 0;
9109 if (right < 0)
9110 right = 0;
9111
Tim Peters7a29bd52001-09-12 03:03:31 +00009112 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009113 Py_INCREF(self);
9114 return self;
9115 }
9116
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009117 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9118 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009119 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9120 return NULL;
9121 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009122 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9123 if (fill > maxchar)
9124 maxchar = fill;
9125 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009126 if (!u)
9127 return NULL;
9128
9129 kind = PyUnicode_KIND(u);
9130 data = PyUnicode_DATA(u);
9131 if (left)
9132 FILL(kind, data, fill, 0, left);
9133 if (right)
9134 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02009135 if (PyUnicode_CopyCharacters(u, left,
9136 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009137 _PyUnicode_LENGTH(self)) < 0)
9138 {
9139 Py_DECREF(u);
9140 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009141 }
9142
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009143 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009144}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009145#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009146
Alexander Belopolsky40018472011-02-26 01:02:56 +00009147PyObject *
9148PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009149{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009150 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009151
9152 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009153 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009154 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009155
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009156 switch(PyUnicode_KIND(string)) {
9157 case PyUnicode_1BYTE_KIND:
9158 list = ucs1lib_splitlines(
9159 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9160 PyUnicode_GET_LENGTH(string), keepends);
9161 break;
9162 case PyUnicode_2BYTE_KIND:
9163 list = ucs2lib_splitlines(
9164 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
9165 PyUnicode_GET_LENGTH(string), keepends);
9166 break;
9167 case PyUnicode_4BYTE_KIND:
9168 list = ucs4lib_splitlines(
9169 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
9170 PyUnicode_GET_LENGTH(string), keepends);
9171 break;
9172 default:
9173 assert(0);
9174 list = 0;
9175 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009176 Py_DECREF(string);
9177 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009178}
9179
Alexander Belopolsky40018472011-02-26 01:02:56 +00009180static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009181split(PyObject *self,
9182 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009183 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009184{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009185 int kind1, kind2, kind;
9186 void *buf1, *buf2;
9187 Py_ssize_t len1, len2;
9188 PyObject* out;
9189
Guido van Rossumd57fd912000-03-10 22:53:23 +00009190 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009191 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009192
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009193 if (PyUnicode_READY(self) == -1)
9194 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009195
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009196 if (substring == NULL)
9197 switch(PyUnicode_KIND(self)) {
9198 case PyUnicode_1BYTE_KIND:
9199 return ucs1lib_split_whitespace(
9200 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9201 PyUnicode_GET_LENGTH(self), maxcount
9202 );
9203 case PyUnicode_2BYTE_KIND:
9204 return ucs2lib_split_whitespace(
9205 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9206 PyUnicode_GET_LENGTH(self), maxcount
9207 );
9208 case PyUnicode_4BYTE_KIND:
9209 return ucs4lib_split_whitespace(
9210 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9211 PyUnicode_GET_LENGTH(self), maxcount
9212 );
9213 default:
9214 assert(0);
9215 return NULL;
9216 }
9217
9218 if (PyUnicode_READY(substring) == -1)
9219 return NULL;
9220
9221 kind1 = PyUnicode_KIND(self);
9222 kind2 = PyUnicode_KIND(substring);
9223 kind = kind1 > kind2 ? kind1 : kind2;
9224 buf1 = PyUnicode_DATA(self);
9225 buf2 = PyUnicode_DATA(substring);
9226 if (kind1 != kind)
9227 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9228 if (!buf1)
9229 return NULL;
9230 if (kind2 != kind)
9231 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9232 if (!buf2) {
9233 if (kind1 != kind) PyMem_Free(buf1);
9234 return NULL;
9235 }
9236 len1 = PyUnicode_GET_LENGTH(self);
9237 len2 = PyUnicode_GET_LENGTH(substring);
9238
9239 switch(kind) {
9240 case PyUnicode_1BYTE_KIND:
9241 out = ucs1lib_split(
9242 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9243 break;
9244 case PyUnicode_2BYTE_KIND:
9245 out = ucs2lib_split(
9246 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9247 break;
9248 case PyUnicode_4BYTE_KIND:
9249 out = ucs4lib_split(
9250 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9251 break;
9252 default:
9253 out = NULL;
9254 }
9255 if (kind1 != kind)
9256 PyMem_Free(buf1);
9257 if (kind2 != kind)
9258 PyMem_Free(buf2);
9259 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009260}
9261
Alexander Belopolsky40018472011-02-26 01:02:56 +00009262static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009263rsplit(PyObject *self,
9264 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009265 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009266{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009267 int kind1, kind2, kind;
9268 void *buf1, *buf2;
9269 Py_ssize_t len1, len2;
9270 PyObject* out;
9271
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009272 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009273 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009274
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009275 if (PyUnicode_READY(self) == -1)
9276 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009277
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009278 if (substring == NULL)
9279 switch(PyUnicode_KIND(self)) {
9280 case PyUnicode_1BYTE_KIND:
9281 return ucs1lib_rsplit_whitespace(
9282 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9283 PyUnicode_GET_LENGTH(self), maxcount
9284 );
9285 case PyUnicode_2BYTE_KIND:
9286 return ucs2lib_rsplit_whitespace(
9287 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9288 PyUnicode_GET_LENGTH(self), maxcount
9289 );
9290 case PyUnicode_4BYTE_KIND:
9291 return ucs4lib_rsplit_whitespace(
9292 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9293 PyUnicode_GET_LENGTH(self), maxcount
9294 );
9295 default:
9296 assert(0);
9297 return NULL;
9298 }
9299
9300 if (PyUnicode_READY(substring) == -1)
9301 return NULL;
9302
9303 kind1 = PyUnicode_KIND(self);
9304 kind2 = PyUnicode_KIND(substring);
9305 kind = kind1 > kind2 ? kind1 : kind2;
9306 buf1 = PyUnicode_DATA(self);
9307 buf2 = PyUnicode_DATA(substring);
9308 if (kind1 != kind)
9309 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9310 if (!buf1)
9311 return NULL;
9312 if (kind2 != kind)
9313 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9314 if (!buf2) {
9315 if (kind1 != kind) PyMem_Free(buf1);
9316 return NULL;
9317 }
9318 len1 = PyUnicode_GET_LENGTH(self);
9319 len2 = PyUnicode_GET_LENGTH(substring);
9320
9321 switch(kind) {
9322 case PyUnicode_1BYTE_KIND:
9323 out = ucs1lib_rsplit(
9324 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9325 break;
9326 case PyUnicode_2BYTE_KIND:
9327 out = ucs2lib_rsplit(
9328 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9329 break;
9330 case PyUnicode_4BYTE_KIND:
9331 out = ucs4lib_rsplit(
9332 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9333 break;
9334 default:
9335 out = NULL;
9336 }
9337 if (kind1 != kind)
9338 PyMem_Free(buf1);
9339 if (kind2 != kind)
9340 PyMem_Free(buf2);
9341 return out;
9342}
9343
9344static Py_ssize_t
9345anylib_find(int kind, void *buf1, Py_ssize_t len1,
9346 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
9347{
9348 switch(kind) {
9349 case PyUnicode_1BYTE_KIND:
9350 return ucs1lib_find(buf1, len1, buf2, len2, offset);
9351 case PyUnicode_2BYTE_KIND:
9352 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9353 case PyUnicode_4BYTE_KIND:
9354 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9355 }
9356 assert(0);
9357 return -1;
9358}
9359
9360static Py_ssize_t
9361anylib_count(int kind, void* sbuf, Py_ssize_t slen,
9362 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
9363{
9364 switch(kind) {
9365 case PyUnicode_1BYTE_KIND:
9366 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9367 case PyUnicode_2BYTE_KIND:
9368 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9369 case PyUnicode_4BYTE_KIND:
9370 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9371 }
9372 assert(0);
9373 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009374}
9375
Alexander Belopolsky40018472011-02-26 01:02:56 +00009376static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009377replace(PyObject *self, PyObject *str1,
9378 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009379{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009380 PyObject *u;
9381 char *sbuf = PyUnicode_DATA(self);
9382 char *buf1 = PyUnicode_DATA(str1);
9383 char *buf2 = PyUnicode_DATA(str2);
9384 int srelease = 0, release1 = 0, release2 = 0;
9385 int skind = PyUnicode_KIND(self);
9386 int kind1 = PyUnicode_KIND(str1);
9387 int kind2 = PyUnicode_KIND(str2);
9388 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9389 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9390 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009391
9392 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009393 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009394 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009395 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009396
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009397 if (skind < kind1)
9398 /* substring too wide to be present */
9399 goto nothing;
9400
9401 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00009402 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009403 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009404 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009405 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009406 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009407 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009408 Py_UCS4 u1, u2, maxchar;
9409 int mayshrink, rkind;
9410 u1 = PyUnicode_READ_CHAR(str1, 0);
9411 if (!findchar(sbuf, PyUnicode_KIND(self),
9412 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00009413 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009414 u2 = PyUnicode_READ_CHAR(str2, 0);
9415 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9416 /* Replacing u1 with u2 may cause a maxchar reduction in the
9417 result string. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009418 if (u2 > maxchar) {
9419 maxchar = u2;
9420 mayshrink = 0;
9421 }
Victor Stinnerb9275c12011-10-05 14:01:42 +02009422 else
9423 mayshrink = maxchar > 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009424 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009425 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009426 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009427 if (PyUnicode_CopyCharacters(u, 0,
9428 (PyObject*)self, 0, slen) < 0)
9429 {
9430 Py_DECREF(u);
9431 return NULL;
9432 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009433 rkind = PyUnicode_KIND(u);
9434 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9435 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009436 if (--maxcount < 0)
9437 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009438 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009439 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009440 if (mayshrink) {
9441 PyObject *tmp = u;
9442 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
9443 PyUnicode_GET_LENGTH(tmp));
9444 Py_DECREF(tmp);
9445 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009446 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009447 int rkind = skind;
9448 char *res;
9449 if (kind1 < rkind) {
9450 /* widen substring */
9451 buf1 = _PyUnicode_AsKind(str1, rkind);
9452 if (!buf1) goto error;
9453 release1 = 1;
9454 }
9455 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009456 if (i < 0)
9457 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009458 if (rkind > kind2) {
9459 /* widen replacement */
9460 buf2 = _PyUnicode_AsKind(str2, rkind);
9461 if (!buf2) goto error;
9462 release2 = 1;
9463 }
9464 else if (rkind < kind2) {
9465 /* widen self and buf1 */
9466 rkind = kind2;
9467 if (release1) PyMem_Free(buf1);
9468 sbuf = _PyUnicode_AsKind(self, rkind);
9469 if (!sbuf) goto error;
9470 srelease = 1;
9471 buf1 = _PyUnicode_AsKind(str1, rkind);
9472 if (!buf1) goto error;
9473 release1 = 1;
9474 }
9475 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
9476 if (!res) {
9477 PyErr_NoMemory();
9478 goto error;
9479 }
9480 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009481 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009482 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9483 buf2,
9484 PyUnicode_KIND_SIZE(rkind, len2));
9485 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009486
9487 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009488 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
9489 slen-i,
9490 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009491 if (i == -1)
9492 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009493 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9494 buf2,
9495 PyUnicode_KIND_SIZE(rkind, len2));
9496 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009497 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009498
9499 u = PyUnicode_FromKindAndData(rkind, res, slen);
9500 PyMem_Free(res);
9501 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009502 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009503 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009504
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009505 Py_ssize_t n, i, j, ires;
9506 Py_ssize_t product, new_size;
9507 int rkind = skind;
9508 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009509
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009510 if (kind1 < rkind) {
9511 buf1 = _PyUnicode_AsKind(str1, rkind);
9512 if (!buf1) goto error;
9513 release1 = 1;
9514 }
9515 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009516 if (n == 0)
9517 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009518 if (kind2 < rkind) {
9519 buf2 = _PyUnicode_AsKind(str2, rkind);
9520 if (!buf2) goto error;
9521 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009522 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009523 else if (kind2 > rkind) {
9524 rkind = kind2;
9525 sbuf = _PyUnicode_AsKind(self, rkind);
9526 if (!sbuf) goto error;
9527 srelease = 1;
9528 if (release1) PyMem_Free(buf1);
9529 buf1 = _PyUnicode_AsKind(str1, rkind);
9530 if (!buf1) goto error;
9531 release1 = 1;
9532 }
9533 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9534 PyUnicode_GET_LENGTH(str1))); */
9535 product = n * (len2-len1);
9536 if ((product / (len2-len1)) != n) {
9537 PyErr_SetString(PyExc_OverflowError,
9538 "replace string is too long");
9539 goto error;
9540 }
9541 new_size = slen + product;
9542 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9543 PyErr_SetString(PyExc_OverflowError,
9544 "replace string is too long");
9545 goto error;
9546 }
9547 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9548 if (!res)
9549 goto error;
9550 ires = i = 0;
9551 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009552 while (n-- > 0) {
9553 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009554 j = anylib_find(rkind,
9555 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9556 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009557 if (j == -1)
9558 break;
9559 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009560 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009561 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9562 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9563 PyUnicode_KIND_SIZE(rkind, j-i));
9564 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009565 }
9566 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009567 if (len2 > 0) {
9568 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9569 buf2,
9570 PyUnicode_KIND_SIZE(rkind, len2));
9571 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009572 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009573 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009574 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009575 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009576 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009577 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9578 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9579 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009580 } else {
9581 /* interleave */
9582 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009583 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9584 buf2,
9585 PyUnicode_KIND_SIZE(rkind, len2));
9586 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009587 if (--n <= 0)
9588 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009589 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9590 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9591 PyUnicode_KIND_SIZE(rkind, 1));
9592 ires++;
9593 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009594 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009595 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9596 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9597 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009598 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009599 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Martin v. Löwis0b1d3482011-10-01 16:35:40 +02009600 PyMem_Free(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009601 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009602 if (srelease)
9603 PyMem_FREE(sbuf);
9604 if (release1)
9605 PyMem_FREE(buf1);
9606 if (release2)
9607 PyMem_FREE(buf2);
9608 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009609
Benjamin Peterson29060642009-01-31 22:14:21 +00009610 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009611 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009612 if (srelease)
9613 PyMem_FREE(sbuf);
9614 if (release1)
9615 PyMem_FREE(buf1);
9616 if (release2)
9617 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009618 if (PyUnicode_CheckExact(self)) {
9619 Py_INCREF(self);
9620 return (PyObject *) self;
9621 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009622 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009623 error:
9624 if (srelease && sbuf)
9625 PyMem_FREE(sbuf);
9626 if (release1 && buf1)
9627 PyMem_FREE(buf1);
9628 if (release2 && buf2)
9629 PyMem_FREE(buf2);
9630 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009631}
9632
9633/* --- Unicode Object Methods --------------------------------------------- */
9634
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009635PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009636 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009637\n\
9638Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009639characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009640
9641static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +02009642unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009643{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009644 return fixup(self, fixtitle);
9645}
9646
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009647PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009648 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009649\n\
9650Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009651have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009652
9653static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +02009654unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009655{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009656 return fixup(self, fixcapitalize);
9657}
9658
#if 0
/* Disabled: this whole section is excluded from compilation. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self into words, replace every word by its fixup(..., fixcapitalize)
   result, and join the words again.  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined = NULL;
    Py_ssize_t idx = 0;

    /* Break the string up into a list of words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Swap each list entry for its capitalized counterpart */
    while (idx < PyList_GET_SIZE(words)) {
        PyObject *capped = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                                 fixcapitalize);
        if (capped == NULL)
            goto onError;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capped);
        idx++;
    }

    /* Glue the words back together */
    joined = PyUnicode_Join(NULL, words);

  onError:
    Py_DECREF(words);
    return joined;
}
#endif
9696
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009697/* Argument converter. Coerces to a single unicode character */
9698
9699static int
9700convert_uc(PyObject *obj, void *addr)
9701{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009702 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009703 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009704
Benjamin Peterson14339b62009-01-31 16:36:08 +00009705 uniobj = PyUnicode_FromObject(obj);
9706 if (uniobj == NULL) {
9707 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009708 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009709 return 0;
9710 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009711 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009712 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009713 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009714 Py_DECREF(uniobj);
9715 return 0;
9716 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009717 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009718 Py_DECREF(uniobj);
9719 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009720}
9721
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009722PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009723 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009724\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009725Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009726done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009727
9728static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009729unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009730{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009731 Py_ssize_t marg, left;
9732 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009733 Py_UCS4 fillchar = ' ';
9734
Victor Stinnere9a29352011-10-01 02:14:59 +02009735 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009736 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009737
Victor Stinnere9a29352011-10-01 02:14:59 +02009738 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009739 return NULL;
9740
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009741 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009742 Py_INCREF(self);
9743 return (PyObject*) self;
9744 }
9745
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009746 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009747 left = marg / 2 + (marg & width & 1);
9748
Victor Stinner9310abb2011-10-05 00:59:23 +02009749 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009750}
9751
Marc-André Lemburge5034372000-08-08 08:04:29 +00009752#if 0
9753
9754/* This code should go into some future Unicode collation support
9755 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009756 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009757
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009758/* speedy UTF-16 code point order comparison */
9759/* gleaned from: */
9760/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9761
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009762static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009763{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009764 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009765 0, 0, 0, 0, 0, 0, 0, 0,
9766 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009767 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009768};
9769
Guido van Rossumd57fd912000-03-10 22:53:23 +00009770static int
9771unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9772{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009773 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009774
Guido van Rossumd57fd912000-03-10 22:53:23 +00009775 Py_UNICODE *s1 = str1->str;
9776 Py_UNICODE *s2 = str2->str;
9777
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009778 len1 = str1->_base._base.length;
9779 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009780
Guido van Rossumd57fd912000-03-10 22:53:23 +00009781 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009782 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009783
9784 c1 = *s1++;
9785 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009786
Benjamin Peterson29060642009-01-31 22:14:21 +00009787 if (c1 > (1<<11) * 26)
9788 c1 += utf16Fixup[c1>>11];
9789 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009790 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009791 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009792
9793 if (c1 != c2)
9794 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009795
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009796 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009797 }
9798
9799 return (len1 < len2) ? -1 : (len1 != len2);
9800}
9801
Marc-André Lemburge5034372000-08-08 08:04:29 +00009802#else
9803
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009804/* This function assumes that str1 and str2 are readied by the caller. */
9805
Marc-André Lemburge5034372000-08-08 08:04:29 +00009806static int
9807unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9808{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009809 int kind1, kind2;
9810 void *data1, *data2;
9811 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009813 kind1 = PyUnicode_KIND(str1);
9814 kind2 = PyUnicode_KIND(str2);
9815 data1 = PyUnicode_DATA(str1);
9816 data2 = PyUnicode_DATA(str2);
9817 len1 = PyUnicode_GET_LENGTH(str1);
9818 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009819
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009820 for (i = 0; i < len1 && i < len2; ++i) {
9821 Py_UCS4 c1, c2;
9822 c1 = PyUnicode_READ(kind1, data1, i);
9823 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009824
9825 if (c1 != c2)
9826 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009827 }
9828
9829 return (len1 < len2) ? -1 : (len1 != len2);
9830}
9831
9832#endif
9833
Alexander Belopolsky40018472011-02-26 01:02:56 +00009834int
9835PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009836{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009837 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9838 if (PyUnicode_READY(left) == -1 ||
9839 PyUnicode_READY(right) == -1)
9840 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009841 return unicode_compare((PyUnicodeObject *)left,
9842 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009843 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009844 PyErr_Format(PyExc_TypeError,
9845 "Can't compare %.100s and %.100s",
9846 left->ob_type->tp_name,
9847 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009848 return -1;
9849}
9850
Martin v. Löwis5b222132007-06-10 09:51:05 +00009851int
9852PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9853{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009854 Py_ssize_t i;
9855 int kind;
9856 void *data;
9857 Py_UCS4 chr;
9858
Victor Stinner910337b2011-10-03 03:20:16 +02009859 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009860 if (PyUnicode_READY(uni) == -1)
9861 return -1;
9862 kind = PyUnicode_KIND(uni);
9863 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009864 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009865 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9866 if (chr != str[i])
9867 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009868 /* This check keeps Python strings that end in '\0' from comparing equal
9869 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009870 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009871 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009872 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009873 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009874 return 0;
9875}
9876
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009877
Benjamin Peterson29060642009-01-31 22:14:21 +00009878#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009879 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009880
Alexander Belopolsky40018472011-02-26 01:02:56 +00009881PyObject *
9882PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009883{
9884 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009885
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009886 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9887 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009888 if (PyUnicode_READY(left) == -1 ||
9889 PyUnicode_READY(right) == -1)
9890 return NULL;
9891 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9892 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009893 if (op == Py_EQ) {
9894 Py_INCREF(Py_False);
9895 return Py_False;
9896 }
9897 if (op == Py_NE) {
9898 Py_INCREF(Py_True);
9899 return Py_True;
9900 }
9901 }
9902 if (left == right)
9903 result = 0;
9904 else
9905 result = unicode_compare((PyUnicodeObject *)left,
9906 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009907
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009908 /* Convert the return value to a Boolean */
9909 switch (op) {
9910 case Py_EQ:
9911 v = TEST_COND(result == 0);
9912 break;
9913 case Py_NE:
9914 v = TEST_COND(result != 0);
9915 break;
9916 case Py_LE:
9917 v = TEST_COND(result <= 0);
9918 break;
9919 case Py_GE:
9920 v = TEST_COND(result >= 0);
9921 break;
9922 case Py_LT:
9923 v = TEST_COND(result == -1);
9924 break;
9925 case Py_GT:
9926 v = TEST_COND(result == 1);
9927 break;
9928 default:
9929 PyErr_BadArgument();
9930 return NULL;
9931 }
9932 Py_INCREF(v);
9933 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009934 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009935
Brian Curtindfc80e32011-08-10 20:28:54 -05009936 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009937}
9938
Alexander Belopolsky40018472011-02-26 01:02:56 +00009939int
9940PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009941{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009942 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009943 int kind1, kind2, kind;
9944 void *buf1, *buf2;
9945 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009946 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009947
9948 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009949 sub = PyUnicode_FromObject(element);
9950 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009951 PyErr_Format(PyExc_TypeError,
9952 "'in <string>' requires string as left operand, not %s",
9953 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009954 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009955 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009956 if (PyUnicode_READY(sub) == -1)
9957 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009958
Thomas Wouters477c8d52006-05-27 19:21:47 +00009959 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +02009960 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009961 Py_DECREF(sub);
9962 return -1;
9963 }
9964
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009965 kind1 = PyUnicode_KIND(str);
9966 kind2 = PyUnicode_KIND(sub);
9967 kind = kind1 > kind2 ? kind1 : kind2;
9968 buf1 = PyUnicode_DATA(str);
9969 buf2 = PyUnicode_DATA(sub);
9970 if (kind1 != kind)
9971 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9972 if (!buf1) {
9973 Py_DECREF(sub);
9974 return -1;
9975 }
9976 if (kind2 != kind)
9977 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9978 if (!buf2) {
9979 Py_DECREF(sub);
9980 if (kind1 != kind) PyMem_Free(buf1);
9981 return -1;
9982 }
9983 len1 = PyUnicode_GET_LENGTH(str);
9984 len2 = PyUnicode_GET_LENGTH(sub);
9985
9986 switch(kind) {
9987 case PyUnicode_1BYTE_KIND:
9988 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9989 break;
9990 case PyUnicode_2BYTE_KIND:
9991 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9992 break;
9993 case PyUnicode_4BYTE_KIND:
9994 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9995 break;
9996 default:
9997 result = -1;
9998 assert(0);
9999 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010000
10001 Py_DECREF(str);
10002 Py_DECREF(sub);
10003
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010004 if (kind1 != kind)
10005 PyMem_Free(buf1);
10006 if (kind2 != kind)
10007 PyMem_Free(buf2);
10008
Guido van Rossum403d68b2000-03-13 15:55:09 +000010009 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010010}
10011
Guido van Rossumd57fd912000-03-10 22:53:23 +000010012/* Concat to string or Unicode object giving a new Unicode object. */
10013
Alexander Belopolsky40018472011-02-26 01:02:56 +000010014PyObject *
10015PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010016{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010017 PyObject *u = NULL, *v = NULL, *w;
10018 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010019
10020 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010021 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010022 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010023 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010024 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010025 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010026 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010027
10028 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010029 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010030 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010031 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010032 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010033 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010034 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010035 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010036 }
10037
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010038 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +020010039 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010040
Guido van Rossumd57fd912000-03-10 22:53:23 +000010041 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010042 w = PyUnicode_New(
10043 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10044 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010045 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010046 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010047 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
10048 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +020010049 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010050 v, 0,
10051 PyUnicode_GET_LENGTH(v)) < 0)
10052 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010053 Py_DECREF(u);
10054 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010055 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010056
Benjamin Peterson29060642009-01-31 22:14:21 +000010057 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010058 Py_XDECREF(u);
10059 Py_XDECREF(v);
10060 return NULL;
10061}
10062
Victor Stinnerb0923652011-10-04 01:17:31 +020010063static void
10064unicode_append_inplace(PyObject **p_left, PyObject *right)
10065{
10066 Py_ssize_t left_len, right_len, new_len;
10067#ifdef Py_DEBUG
10068 Py_ssize_t copied;
10069#endif
10070
10071 assert(PyUnicode_IS_READY(*p_left));
10072 assert(PyUnicode_IS_READY(right));
10073
10074 left_len = PyUnicode_GET_LENGTH(*p_left);
10075 right_len = PyUnicode_GET_LENGTH(right);
10076 if (left_len > PY_SSIZE_T_MAX - right_len) {
10077 PyErr_SetString(PyExc_OverflowError,
10078 "strings are too large to concat");
10079 goto error;
10080 }
10081 new_len = left_len + right_len;
10082
10083 /* Now we own the last reference to 'left', so we can resize it
10084 * in-place.
10085 */
10086 if (unicode_resize(p_left, new_len) != 0) {
10087 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10088 * deallocated so it cannot be put back into
10089 * 'variable'. The MemoryError is raised when there
10090 * is no value in 'variable', which might (very
10091 * remotely) be a cause of incompatibilities.
10092 */
10093 goto error;
10094 }
10095 /* copy 'right' into the newly allocated area of 'left' */
10096#ifdef Py_DEBUG
10097 copied = PyUnicode_CopyCharacters(*p_left, left_len,
10098 right, 0,
10099 right_len);
10100 assert(0 <= copied);
10101#else
10102 PyUnicode_CopyCharacters(*p_left, left_len, right, 0, right_len);
10103#endif
10104 return;
10105
10106error:
10107 Py_DECREF(*p_left);
10108 *p_left = NULL;
10109}
10110
Walter Dörwald1ab83302007-05-18 17:15:44 +000010111void
Victor Stinner23e56682011-10-03 03:54:37 +020010112PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010113{
Victor Stinner23e56682011-10-03 03:54:37 +020010114 PyObject *left, *res;
10115
10116 if (p_left == NULL) {
10117 if (!PyErr_Occurred())
10118 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010119 return;
10120 }
Victor Stinner23e56682011-10-03 03:54:37 +020010121 left = *p_left;
10122 if (right == NULL || !PyUnicode_Check(left)) {
10123 if (!PyErr_Occurred())
10124 PyErr_BadInternalCall();
10125 goto error;
10126 }
10127
Victor Stinnere1335c72011-10-04 20:53:03 +020010128 if (PyUnicode_READY(left))
10129 goto error;
10130 if (PyUnicode_READY(right))
10131 goto error;
10132
Victor Stinner23e56682011-10-03 03:54:37 +020010133 if (PyUnicode_CheckExact(left) && left != unicode_empty
10134 && PyUnicode_CheckExact(right) && right != unicode_empty
10135 && unicode_resizable(left)
10136 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10137 || _PyUnicode_WSTR(left) != NULL))
10138 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010139 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10140 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010141 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010142 not so different than duplicating the string. */
10143 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010144 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010145 unicode_append_inplace(p_left, right);
Victor Stinner23e56682011-10-03 03:54:37 +020010146 return;
10147 }
10148 }
10149
10150 res = PyUnicode_Concat(left, right);
10151 if (res == NULL)
10152 goto error;
10153 Py_DECREF(left);
10154 *p_left = res;
10155 return;
10156
10157error:
10158 Py_DECREF(*p_left);
10159 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010160}
10161
10162void
10163PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10164{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010165 PyUnicode_Append(pleft, right);
10166 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010167}
10168
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010169PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010170 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010171\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010172Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010173string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010174interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010175
10176static PyObject *
10177unicode_count(PyUnicodeObject *self, PyObject *args)
10178{
10179 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010180 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010181 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010182 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010183 int kind1, kind2, kind;
10184 void *buf1, *buf2;
10185 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010186
Jesus Ceaac451502011-04-20 17:09:23 +020010187 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10188 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010189 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010190
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010191 kind1 = PyUnicode_KIND(self);
10192 kind2 = PyUnicode_KIND(substring);
10193 kind = kind1 > kind2 ? kind1 : kind2;
10194 buf1 = PyUnicode_DATA(self);
10195 buf2 = PyUnicode_DATA(substring);
10196 if (kind1 != kind)
10197 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10198 if (!buf1) {
10199 Py_DECREF(substring);
10200 return NULL;
10201 }
10202 if (kind2 != kind)
10203 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10204 if (!buf2) {
10205 Py_DECREF(substring);
10206 if (kind1 != kind) PyMem_Free(buf1);
10207 return NULL;
10208 }
10209 len1 = PyUnicode_GET_LENGTH(self);
10210 len2 = PyUnicode_GET_LENGTH(substring);
10211
10212 ADJUST_INDICES(start, end, len1);
10213 switch(kind) {
10214 case PyUnicode_1BYTE_KIND:
10215 iresult = ucs1lib_count(
10216 ((Py_UCS1*)buf1) + start, end - start,
10217 buf2, len2, PY_SSIZE_T_MAX
10218 );
10219 break;
10220 case PyUnicode_2BYTE_KIND:
10221 iresult = ucs2lib_count(
10222 ((Py_UCS2*)buf1) + start, end - start,
10223 buf2, len2, PY_SSIZE_T_MAX
10224 );
10225 break;
10226 case PyUnicode_4BYTE_KIND:
10227 iresult = ucs4lib_count(
10228 ((Py_UCS4*)buf1) + start, end - start,
10229 buf2, len2, PY_SSIZE_T_MAX
10230 );
10231 break;
10232 default:
10233 assert(0); iresult = 0;
10234 }
10235
10236 result = PyLong_FromSsize_t(iresult);
10237
10238 if (kind1 != kind)
10239 PyMem_Free(buf1);
10240 if (kind2 != kind)
10241 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010242
10243 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010244
Guido van Rossumd57fd912000-03-10 22:53:23 +000010245 return result;
10246}
10247
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010248PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010249 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010250\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010251Encode S using the codec registered for encoding. Default encoding\n\
10252is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010253handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010254a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10255'xmlcharrefreplace' as well as any other name registered with\n\
10256codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010257
10258static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010259unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010260{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010261 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010262 char *encoding = NULL;
10263 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010264
Benjamin Peterson308d6372009-09-18 21:42:35 +000010265 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10266 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010267 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010268 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010269}
10270
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010271PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010272 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010273\n\
10274Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010275If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010276
10277static PyObject*
10278unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10279{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010280 Py_ssize_t i, j, line_pos, src_len, incr;
10281 Py_UCS4 ch;
10282 PyObject *u;
10283 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010284 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010285 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010286 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010287
10288 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010289 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010290
Antoine Pitrou22425222011-10-04 19:10:51 +020010291 if (PyUnicode_READY(self) == -1)
10292 return NULL;
10293
Thomas Wouters7e474022000-07-16 12:04:32 +000010294 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010295 src_len = PyUnicode_GET_LENGTH(self);
10296 i = j = line_pos = 0;
10297 kind = PyUnicode_KIND(self);
10298 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010299 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010300 for (; i < src_len; i++) {
10301 ch = PyUnicode_READ(kind, src_data, i);
10302 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010303 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010304 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010305 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010306 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010307 goto overflow;
10308 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010309 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010310 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010311 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010312 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010313 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010314 goto overflow;
10315 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010316 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010317 if (ch == '\n' || ch == '\r')
10318 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010319 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010320 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020010321 if (!found && PyUnicode_CheckExact(self)) {
10322 Py_INCREF((PyObject *) self);
10323 return (PyObject *) self;
10324 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010325
Guido van Rossumd57fd912000-03-10 22:53:23 +000010326 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010327 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010328 if (!u)
10329 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010330 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010331
Antoine Pitroue71d5742011-10-04 15:55:09 +020010332 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010333
Antoine Pitroue71d5742011-10-04 15:55:09 +020010334 for (; i < src_len; i++) {
10335 ch = PyUnicode_READ(kind, src_data, i);
10336 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010337 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010338 incr = tabsize - (line_pos % tabsize);
10339 line_pos += incr;
10340 while (incr--) {
10341 PyUnicode_WRITE(kind, dest_data, j, ' ');
10342 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010343 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010344 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010345 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010346 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010347 line_pos++;
10348 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010349 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010350 if (ch == '\n' || ch == '\r')
10351 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010352 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010353 }
10354 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinner17efeed2011-10-04 20:05:46 +020010355#ifndef DONT_MAKE_RESULT_READY
10356 if (_PyUnicode_READY_REPLACE(&u)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010357 Py_DECREF(u);
10358 return NULL;
10359 }
Victor Stinner17efeed2011-10-04 20:05:46 +020010360#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +000010361 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010362
Antoine Pitroue71d5742011-10-04 15:55:09 +020010363 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010364 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10365 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010366}
10367
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010368PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010369 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010370\n\
10371Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010372such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010373arguments start and end are interpreted as in slice notation.\n\
10374\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010375Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010376
10377static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010378unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010379{
Jesus Ceaac451502011-04-20 17:09:23 +020010380 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010381 Py_ssize_t start;
10382 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010383 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010384
Jesus Ceaac451502011-04-20 17:09:23 +020010385 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10386 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010387 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010388
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010389 if (PyUnicode_READY(self) == -1)
10390 return NULL;
10391 if (PyUnicode_READY(substring) == -1)
10392 return NULL;
10393
10394 result = any_find_slice(
10395 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10396 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010397 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010398
10399 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010400
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010401 if (result == -2)
10402 return NULL;
10403
Christian Heimes217cfd12007-12-02 14:31:20 +000010404 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010405}
10406
10407static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010408unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010409{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010410 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10411 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010412 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010413 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010414}
10415
Guido van Rossumc2504932007-09-18 19:42:40 +000010416/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010417 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010418static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000010419unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010420{
Guido van Rossumc2504932007-09-18 19:42:40 +000010421 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010422 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010423
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010424 if (_PyUnicode_HASH(self) != -1)
10425 return _PyUnicode_HASH(self);
10426 if (PyUnicode_READY(self) == -1)
10427 return -1;
10428 len = PyUnicode_GET_LENGTH(self);
10429
10430 /* The hash function as a macro, gets expanded three times below. */
10431#define HASH(P) \
10432 x = (Py_uhash_t)*P << 7; \
10433 while (--len >= 0) \
10434 x = (1000003*x) ^ (Py_uhash_t)*P++;
10435
10436 switch (PyUnicode_KIND(self)) {
10437 case PyUnicode_1BYTE_KIND: {
10438 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10439 HASH(c);
10440 break;
10441 }
10442 case PyUnicode_2BYTE_KIND: {
10443 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10444 HASH(s);
10445 break;
10446 }
10447 default: {
10448 Py_UCS4 *l;
10449 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10450 "Impossible switch case in unicode_hash");
10451 l = PyUnicode_4BYTE_DATA(self);
10452 HASH(l);
10453 break;
10454 }
10455 }
10456 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10457
Guido van Rossumc2504932007-09-18 19:42:40 +000010458 if (x == -1)
10459 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010460 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010461 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010462}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010463#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010464
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010465PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010466 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010467\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010468Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010469
10470static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010471unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010472{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010473 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020010474 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010475 Py_ssize_t start;
10476 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010477
Jesus Ceaac451502011-04-20 17:09:23 +020010478 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10479 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010480 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010481
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010482 if (PyUnicode_READY(self) == -1)
10483 return NULL;
10484 if (PyUnicode_READY(substring) == -1)
10485 return NULL;
10486
10487 result = any_find_slice(
10488 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10489 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010490 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010491
10492 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010493
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010494 if (result == -2)
10495 return NULL;
10496
Guido van Rossumd57fd912000-03-10 22:53:23 +000010497 if (result < 0) {
10498 PyErr_SetString(PyExc_ValueError, "substring not found");
10499 return NULL;
10500 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010501
Christian Heimes217cfd12007-12-02 14:31:20 +000010502 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010503}
10504
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010505PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010506 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010507\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010508Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010509at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010510
10511static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010512unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010513{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010514 Py_ssize_t i, length;
10515 int kind;
10516 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010517 int cased;
10518
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010519 if (PyUnicode_READY(self) == -1)
10520 return NULL;
10521 length = PyUnicode_GET_LENGTH(self);
10522 kind = PyUnicode_KIND(self);
10523 data = PyUnicode_DATA(self);
10524
Guido van Rossumd57fd912000-03-10 22:53:23 +000010525 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010526 if (length == 1)
10527 return PyBool_FromLong(
10528 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010529
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010530 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010531 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010532 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010533
Guido van Rossumd57fd912000-03-10 22:53:23 +000010534 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010535 for (i = 0; i < length; i++) {
10536 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010537
Benjamin Peterson29060642009-01-31 22:14:21 +000010538 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10539 return PyBool_FromLong(0);
10540 else if (!cased && Py_UNICODE_ISLOWER(ch))
10541 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010542 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010543 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010544}
10545
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010546PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010547 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010548\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010549Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010550at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010551
10552static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010553unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010554{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010555 Py_ssize_t i, length;
10556 int kind;
10557 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010558 int cased;
10559
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010560 if (PyUnicode_READY(self) == -1)
10561 return NULL;
10562 length = PyUnicode_GET_LENGTH(self);
10563 kind = PyUnicode_KIND(self);
10564 data = PyUnicode_DATA(self);
10565
Guido van Rossumd57fd912000-03-10 22:53:23 +000010566 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010567 if (length == 1)
10568 return PyBool_FromLong(
10569 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010570
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010571 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010572 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010573 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010574
Guido van Rossumd57fd912000-03-10 22:53:23 +000010575 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010576 for (i = 0; i < length; i++) {
10577 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010578
Benjamin Peterson29060642009-01-31 22:14:21 +000010579 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10580 return PyBool_FromLong(0);
10581 else if (!cased && Py_UNICODE_ISUPPER(ch))
10582 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010583 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010584 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010585}
10586
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010587PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010588 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010589\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010590Return True if S is a titlecased string and there is at least one\n\
10591character in S, i.e. upper- and titlecase characters may only\n\
10592follow uncased characters and lowercase characters only cased ones.\n\
10593Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010594
10595static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010596unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010597{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010598 Py_ssize_t i, length;
10599 int kind;
10600 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010601 int cased, previous_is_cased;
10602
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010603 if (PyUnicode_READY(self) == -1)
10604 return NULL;
10605 length = PyUnicode_GET_LENGTH(self);
10606 kind = PyUnicode_KIND(self);
10607 data = PyUnicode_DATA(self);
10608
Guido van Rossumd57fd912000-03-10 22:53:23 +000010609 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010610 if (length == 1) {
10611 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10612 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10613 (Py_UNICODE_ISUPPER(ch) != 0));
10614 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010615
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010616 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010617 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010618 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010619
Guido van Rossumd57fd912000-03-10 22:53:23 +000010620 cased = 0;
10621 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010622 for (i = 0; i < length; i++) {
10623 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010624
Benjamin Peterson29060642009-01-31 22:14:21 +000010625 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10626 if (previous_is_cased)
10627 return PyBool_FromLong(0);
10628 previous_is_cased = 1;
10629 cased = 1;
10630 }
10631 else if (Py_UNICODE_ISLOWER(ch)) {
10632 if (!previous_is_cased)
10633 return PyBool_FromLong(0);
10634 previous_is_cased = 1;
10635 cased = 1;
10636 }
10637 else
10638 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010639 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010640 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010641}
10642
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010643PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010644 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010645\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010646Return True if all characters in S are whitespace\n\
10647and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010648
10649static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010650unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010651{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010652 Py_ssize_t i, length;
10653 int kind;
10654 void *data;
10655
10656 if (PyUnicode_READY(self) == -1)
10657 return NULL;
10658 length = PyUnicode_GET_LENGTH(self);
10659 kind = PyUnicode_KIND(self);
10660 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010661
Guido van Rossumd57fd912000-03-10 22:53:23 +000010662 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010663 if (length == 1)
10664 return PyBool_FromLong(
10665 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010666
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010667 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010668 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010669 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010670
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010671 for (i = 0; i < length; i++) {
10672 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010673 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010674 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010675 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010676 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010677}
10678
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010679PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010680 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010681\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010682Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010683and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010684
10685static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010686unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010687{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010688 Py_ssize_t i, length;
10689 int kind;
10690 void *data;
10691
10692 if (PyUnicode_READY(self) == -1)
10693 return NULL;
10694 length = PyUnicode_GET_LENGTH(self);
10695 kind = PyUnicode_KIND(self);
10696 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010697
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010698 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010699 if (length == 1)
10700 return PyBool_FromLong(
10701 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010702
10703 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010704 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010705 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010706
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010707 for (i = 0; i < length; i++) {
10708 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010709 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010710 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010711 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010712}
10713
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010714PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010715 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010716\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010717Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010718and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010719
10720static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010721unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010722{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010723 int kind;
10724 void *data;
10725 Py_ssize_t len, i;
10726
10727 if (PyUnicode_READY(self) == -1)
10728 return NULL;
10729
10730 kind = PyUnicode_KIND(self);
10731 data = PyUnicode_DATA(self);
10732 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010733
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010734 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010735 if (len == 1) {
10736 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10737 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10738 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010739
10740 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010741 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010742 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010743
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010744 for (i = 0; i < len; i++) {
10745 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010746 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010747 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010748 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010749 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010750}
10751
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010752PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010753 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010754\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010755Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010756False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010757
10758static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010759unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010760{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010761 Py_ssize_t i, length;
10762 int kind;
10763 void *data;
10764
10765 if (PyUnicode_READY(self) == -1)
10766 return NULL;
10767 length = PyUnicode_GET_LENGTH(self);
10768 kind = PyUnicode_KIND(self);
10769 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010770
Guido van Rossumd57fd912000-03-10 22:53:23 +000010771 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010772 if (length == 1)
10773 return PyBool_FromLong(
10774 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010775
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010776 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010777 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010778 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010779
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010780 for (i = 0; i < length; i++) {
10781 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010782 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010783 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010784 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010785}
10786
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010787PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010788 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010789\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010790Return True if all characters in S are digits\n\
10791and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010792
10793static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010794unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010795{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010796 Py_ssize_t i, length;
10797 int kind;
10798 void *data;
10799
10800 if (PyUnicode_READY(self) == -1)
10801 return NULL;
10802 length = PyUnicode_GET_LENGTH(self);
10803 kind = PyUnicode_KIND(self);
10804 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010805
Guido van Rossumd57fd912000-03-10 22:53:23 +000010806 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010807 if (length == 1) {
10808 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10809 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10810 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010811
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010812 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010813 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010814 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010815
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010816 for (i = 0; i < length; i++) {
10817 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010818 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010819 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010820 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010821}
10822
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010823PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010824 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010825\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010826Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010827False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010828
10829static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010830unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010831{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010832 Py_ssize_t i, length;
10833 int kind;
10834 void *data;
10835
10836 if (PyUnicode_READY(self) == -1)
10837 return NULL;
10838 length = PyUnicode_GET_LENGTH(self);
10839 kind = PyUnicode_KIND(self);
10840 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010841
Guido van Rossumd57fd912000-03-10 22:53:23 +000010842 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010843 if (length == 1)
10844 return PyBool_FromLong(
10845 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010846
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010847 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010848 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010849 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010850
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010851 for (i = 0; i < length; i++) {
10852 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010853 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010854 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010855 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010856}
10857
Martin v. Löwis47383402007-08-15 07:32:56 +000010858int
10859PyUnicode_IsIdentifier(PyObject *self)
10860{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010861 int kind;
10862 void *data;
10863 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010864 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010865
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010866 if (PyUnicode_READY(self) == -1) {
10867 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010868 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010869 }
10870
10871 /* Special case for empty strings */
10872 if (PyUnicode_GET_LENGTH(self) == 0)
10873 return 0;
10874 kind = PyUnicode_KIND(self);
10875 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010876
10877 /* PEP 3131 says that the first character must be in
10878 XID_Start and subsequent characters in XID_Continue,
10879 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010880 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010881 letters, digits, underscore). However, given the current
10882 definition of XID_Start and XID_Continue, it is sufficient
10883 to check just for these, except that _ must be allowed
10884 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010885 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010886 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010887 return 0;
10888
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010889 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010890 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010891 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010892 return 1;
10893}
10894
10895PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010896 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010897\n\
10898Return True if S is a valid identifier according\n\
10899to the language definition.");
10900
10901static PyObject*
10902unicode_isidentifier(PyObject *self)
10903{
10904 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10905}
10906
Georg Brandl559e5d72008-06-11 18:37:52 +000010907PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010908 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010909\n\
10910Return True if all characters in S are considered\n\
10911printable in repr() or S is empty, False otherwise.");
10912
10913static PyObject*
10914unicode_isprintable(PyObject *self)
10915{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010916 Py_ssize_t i, length;
10917 int kind;
10918 void *data;
10919
10920 if (PyUnicode_READY(self) == -1)
10921 return NULL;
10922 length = PyUnicode_GET_LENGTH(self);
10923 kind = PyUnicode_KIND(self);
10924 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010925
10926 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010927 if (length == 1)
10928 return PyBool_FromLong(
10929 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010930
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010931 for (i = 0; i < length; i++) {
10932 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010933 Py_RETURN_FALSE;
10934 }
10935 }
10936 Py_RETURN_TRUE;
10937}
10938
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010939PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010940 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010941\n\
10942Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010943iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010944
10945static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010946unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010947{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010948 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010949}
10950
Martin v. Löwis18e16552006-02-15 17:27:45 +000010951static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010952unicode_length(PyUnicodeObject *self)
10953{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010954 if (PyUnicode_READY(self) == -1)
10955 return -1;
10956 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010957}
10958
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010959PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010960 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010961\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010962Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010963done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010964
10965static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010966unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010967{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010968 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010969 Py_UCS4 fillchar = ' ';
10970
10971 if (PyUnicode_READY(self) == -1)
10972 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010973
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010974 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010975 return NULL;
10976
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010977 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010978 Py_INCREF(self);
10979 return (PyObject*) self;
10980 }
10981
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010982 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010983}
10984
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010985PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010986 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010987\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010988Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010989
10990static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010991unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010992{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010993 return fixup(self, fixlower);
10994}
10995
/* Strip modes; each value doubles as an index into stripformat[] */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by strip mode */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip",     /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string without its
   three-character "|O:" prefix */
#define STRIPNAME(i) (&stripformat[i][3])
11004
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011005/* externally visible for str.strip(unicode) */
11006PyObject *
11007_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
11008{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011009 void *data;
11010 int kind;
11011 Py_ssize_t i, j, len;
11012 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011013
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011014 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11015 return NULL;
11016
11017 kind = PyUnicode_KIND(self);
11018 data = PyUnicode_DATA(self);
11019 len = PyUnicode_GET_LENGTH(self);
11020 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11021 PyUnicode_DATA(sepobj),
11022 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011023
Benjamin Peterson14339b62009-01-31 16:36:08 +000011024 i = 0;
11025 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011026 while (i < len &&
11027 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011028 i++;
11029 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011030 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011031
Benjamin Peterson14339b62009-01-31 16:36:08 +000011032 j = len;
11033 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011034 do {
11035 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011036 } while (j >= i &&
11037 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011038 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011039 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011040
Victor Stinner12bab6d2011-10-01 01:53:49 +020011041 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011042}
11043
11044PyObject*
11045PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11046{
11047 unsigned char *data;
11048 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011049 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011050
Victor Stinnerde636f32011-10-01 03:55:54 +020011051 if (PyUnicode_READY(self) == -1)
11052 return NULL;
11053
11054 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11055
Victor Stinner12bab6d2011-10-01 01:53:49 +020011056 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011057 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011058 if (PyUnicode_CheckExact(self)) {
11059 Py_INCREF(self);
11060 return self;
11061 }
11062 else
11063 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011064 }
11065
Victor Stinner12bab6d2011-10-01 01:53:49 +020011066 length = end - start;
11067 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011068 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011069
Victor Stinnerde636f32011-10-01 03:55:54 +020011070 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011071 PyErr_SetString(PyExc_IndexError, "string index out of range");
11072 return NULL;
11073 }
11074
Victor Stinnerb9275c12011-10-05 14:01:42 +020011075 if (PyUnicode_IS_ASCII(self)) {
11076 kind = PyUnicode_KIND(self);
11077 data = PyUnicode_1BYTE_DATA(self);
11078 return unicode_fromascii(data + start, length);
11079 }
11080 else {
11081 kind = PyUnicode_KIND(self);
11082 data = PyUnicode_1BYTE_DATA(self);
11083 return PyUnicode_FromKindAndData(kind,
11084 data + PyUnicode_KIND_SIZE(kind, start),
11085 length);
11086 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011087}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011088
11089static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011090do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011091{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011092 int kind;
11093 void *data;
11094 Py_ssize_t len, i, j;
11095
11096 if (PyUnicode_READY(self) == -1)
11097 return NULL;
11098
11099 kind = PyUnicode_KIND(self);
11100 data = PyUnicode_DATA(self);
11101 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011102
Benjamin Peterson14339b62009-01-31 16:36:08 +000011103 i = 0;
11104 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011105 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011106 i++;
11107 }
11108 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011109
Benjamin Peterson14339b62009-01-31 16:36:08 +000011110 j = len;
11111 if (striptype != LEFTSTRIP) {
11112 do {
11113 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011114 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011115 j++;
11116 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011117
Victor Stinner12bab6d2011-10-01 01:53:49 +020011118 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011119}
11120
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011121
11122static PyObject *
11123do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
11124{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011125 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011126
Benjamin Peterson14339b62009-01-31 16:36:08 +000011127 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11128 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011129
Benjamin Peterson14339b62009-01-31 16:36:08 +000011130 if (sep != NULL && sep != Py_None) {
11131 if (PyUnicode_Check(sep))
11132 return _PyUnicode_XStrip(self, striptype, sep);
11133 else {
11134 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011135 "%s arg must be None or str",
11136 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011137 return NULL;
11138 }
11139 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011140
Benjamin Peterson14339b62009-01-31 16:36:08 +000011141 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011142}
11143
11144
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011145PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011146 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011147\n\
11148Return a copy of the string S with leading and trailing\n\
11149whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011150If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011151
11152static PyObject *
11153unicode_strip(PyUnicodeObject *self, PyObject *args)
11154{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011155 if (PyTuple_GET_SIZE(args) == 0)
11156 return do_strip(self, BOTHSTRIP); /* Common case */
11157 else
11158 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011159}
11160
11161
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011162PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011163 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011164\n\
11165Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011166If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011167
11168static PyObject *
11169unicode_lstrip(PyUnicodeObject *self, PyObject *args)
11170{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011171 if (PyTuple_GET_SIZE(args) == 0)
11172 return do_strip(self, LEFTSTRIP); /* Common case */
11173 else
11174 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011175}
11176
11177
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011178PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011179 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011180\n\
11181Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011182If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011183
11184static PyObject *
11185unicode_rstrip(PyUnicodeObject *self, PyObject *args)
11186{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011187 if (PyTuple_GET_SIZE(args) == 0)
11188 return do_strip(self, RIGHTSTRIP); /* Common case */
11189 else
11190 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011191}
11192
11193
Guido van Rossumd57fd912000-03-10 22:53:23 +000011194static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000011195unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011196{
11197 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011198 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011199
Georg Brandl222de0f2009-04-12 12:01:50 +000011200 if (len < 1) {
11201 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011202 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011203 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011204
Tim Peters7a29bd52001-09-12 03:03:31 +000011205 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011206 /* no repeat, return original string */
11207 Py_INCREF(str);
11208 return (PyObject*) str;
11209 }
Tim Peters8f422462000-09-09 06:13:41 +000011210
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011211 if (PyUnicode_READY(str) == -1)
11212 return NULL;
11213
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011214 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011215 PyErr_SetString(PyExc_OverflowError,
11216 "repeated string is too long");
11217 return NULL;
11218 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011219 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011220
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011221 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011222 if (!u)
11223 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011224 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011225
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011226 if (PyUnicode_GET_LENGTH(str) == 1) {
11227 const int kind = PyUnicode_KIND(str);
11228 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11229 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011230 if (kind == PyUnicode_1BYTE_KIND)
11231 memset(to, (unsigned char)fill_char, len);
11232 else {
11233 for (n = 0; n < len; ++n)
11234 PyUnicode_WRITE(kind, to, n, fill_char);
11235 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011236 }
11237 else {
11238 /* number of characters copied this far */
11239 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
11240 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
11241 char *to = (char *) PyUnicode_DATA(u);
11242 Py_MEMCPY(to, PyUnicode_DATA(str),
11243 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011244 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011245 n = (done <= nchars-done) ? done : nchars-done;
11246 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011247 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011248 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011249 }
11250
11251 return (PyObject*) u;
11252}
11253
Alexander Belopolsky40018472011-02-26 01:02:56 +000011254PyObject *
11255PyUnicode_Replace(PyObject *obj,
11256 PyObject *subobj,
11257 PyObject *replobj,
11258 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011259{
11260 PyObject *self;
11261 PyObject *str1;
11262 PyObject *str2;
11263 PyObject *result;
11264
11265 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011266 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011267 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011268 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011269 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011270 Py_DECREF(self);
11271 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011272 }
11273 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011274 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011275 Py_DECREF(self);
11276 Py_DECREF(str1);
11277 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011278 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011279 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011280 Py_DECREF(self);
11281 Py_DECREF(str1);
11282 Py_DECREF(str2);
11283 return result;
11284}
11285
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011286PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011287 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011288\n\
11289Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011290old replaced by new. If the optional argument count is\n\
11291given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011292
11293static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011294unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011295{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011296 PyObject *str1;
11297 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011298 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011299 PyObject *result;
11300
Martin v. Löwis18e16552006-02-15 17:27:45 +000011301 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011302 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011303 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011304 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011305 str1 = PyUnicode_FromObject(str1);
11306 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11307 return NULL;
11308 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011309 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011310 Py_DECREF(str1);
11311 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011312 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011313
11314 result = replace(self, str1, str2, maxcount);
11315
11316 Py_DECREF(str1);
11317 Py_DECREF(str2);
11318 return result;
11319}
11320
Alexander Belopolsky40018472011-02-26 01:02:56 +000011321static PyObject *
11322unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011323{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011324 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011325 Py_ssize_t isize;
11326 Py_ssize_t osize, squote, dquote, i, o;
11327 Py_UCS4 max, quote;
11328 int ikind, okind;
11329 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011330
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011331 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011332 return NULL;
11333
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011334 isize = PyUnicode_GET_LENGTH(unicode);
11335 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011336
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011337 /* Compute length of output, quote characters, and
11338 maximum character */
11339 osize = 2; /* quotes */
11340 max = 127;
11341 squote = dquote = 0;
11342 ikind = PyUnicode_KIND(unicode);
11343 for (i = 0; i < isize; i++) {
11344 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11345 switch (ch) {
11346 case '\'': squote++; osize++; break;
11347 case '"': dquote++; osize++; break;
11348 case '\\': case '\t': case '\r': case '\n':
11349 osize += 2; break;
11350 default:
11351 /* Fast-path ASCII */
11352 if (ch < ' ' || ch == 0x7f)
11353 osize += 4; /* \xHH */
11354 else if (ch < 0x7f)
11355 osize++;
11356 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11357 osize++;
11358 max = ch > max ? ch : max;
11359 }
11360 else if (ch < 0x100)
11361 osize += 4; /* \xHH */
11362 else if (ch < 0x10000)
11363 osize += 6; /* \uHHHH */
11364 else
11365 osize += 10; /* \uHHHHHHHH */
11366 }
11367 }
11368
11369 quote = '\'';
11370 if (squote) {
11371 if (dquote)
11372 /* Both squote and dquote present. Use squote,
11373 and escape them */
11374 osize += squote;
11375 else
11376 quote = '"';
11377 }
11378
11379 repr = PyUnicode_New(osize, max);
11380 if (repr == NULL)
11381 return NULL;
11382 okind = PyUnicode_KIND(repr);
11383 odata = PyUnicode_DATA(repr);
11384
11385 PyUnicode_WRITE(okind, odata, 0, quote);
11386 PyUnicode_WRITE(okind, odata, osize-1, quote);
11387
11388 for (i = 0, o = 1; i < isize; i++) {
11389 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011390
11391 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011392 if ((ch == quote) || (ch == '\\')) {
11393 PyUnicode_WRITE(okind, odata, o++, '\\');
11394 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011395 continue;
11396 }
11397
Benjamin Peterson29060642009-01-31 22:14:21 +000011398 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011399 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011400 PyUnicode_WRITE(okind, odata, o++, '\\');
11401 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011402 }
11403 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011404 PyUnicode_WRITE(okind, odata, o++, '\\');
11405 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011406 }
11407 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011408 PyUnicode_WRITE(okind, odata, o++, '\\');
11409 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011410 }
11411
11412 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011413 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011414 PyUnicode_WRITE(okind, odata, o++, '\\');
11415 PyUnicode_WRITE(okind, odata, o++, 'x');
11416 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11417 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011418 }
11419
Georg Brandl559e5d72008-06-11 18:37:52 +000011420 /* Copy ASCII characters as-is */
11421 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011422 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011423 }
11424
Benjamin Peterson29060642009-01-31 22:14:21 +000011425 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011426 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011427 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011428 (categories Z* and C* except ASCII space)
11429 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011430 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011431 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011432 if (ch <= 0xff) {
11433 PyUnicode_WRITE(okind, odata, o++, '\\');
11434 PyUnicode_WRITE(okind, odata, o++, 'x');
11435 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11436 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011437 }
11438 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011439 else if (ch >= 0x10000) {
11440 PyUnicode_WRITE(okind, odata, o++, '\\');
11441 PyUnicode_WRITE(okind, odata, o++, 'U');
11442 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
11443 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
11444 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
11445 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
11446 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11447 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11448 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11449 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011450 }
11451 /* Map 16-bit characters to '\uxxxx' */
11452 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011453 PyUnicode_WRITE(okind, odata, o++, '\\');
11454 PyUnicode_WRITE(okind, odata, o++, 'u');
11455 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11456 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11457 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11458 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011459 }
11460 }
11461 /* Copy characters as-is */
11462 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011463 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011464 }
11465 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011466 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011467 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000011468 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011469}
11470
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011471PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011472 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011473\n\
11474Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011475such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011476arguments start and end are interpreted as in slice notation.\n\
11477\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011478Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011479
11480static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011481unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011482{
Jesus Ceaac451502011-04-20 17:09:23 +020011483 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011484 Py_ssize_t start;
11485 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011486 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011487
Jesus Ceaac451502011-04-20 17:09:23 +020011488 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11489 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011490 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011491
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011492 if (PyUnicode_READY(self) == -1)
11493 return NULL;
11494 if (PyUnicode_READY(substring) == -1)
11495 return NULL;
11496
11497 result = any_find_slice(
11498 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11499 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011500 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011501
11502 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011503
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011504 if (result == -2)
11505 return NULL;
11506
Christian Heimes217cfd12007-12-02 14:31:20 +000011507 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011508}
11509
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011510PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011511 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011512\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011513Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011514
11515static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011516unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011517{
Jesus Ceaac451502011-04-20 17:09:23 +020011518 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011519 Py_ssize_t start;
11520 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011521 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011522
Jesus Ceaac451502011-04-20 17:09:23 +020011523 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11524 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011525 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011526
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011527 if (PyUnicode_READY(self) == -1)
11528 return NULL;
11529 if (PyUnicode_READY(substring) == -1)
11530 return NULL;
11531
11532 result = any_find_slice(
11533 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11534 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011535 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011536
11537 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011538
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011539 if (result == -2)
11540 return NULL;
11541
Guido van Rossumd57fd912000-03-10 22:53:23 +000011542 if (result < 0) {
11543 PyErr_SetString(PyExc_ValueError, "substring not found");
11544 return NULL;
11545 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011546
Christian Heimes217cfd12007-12-02 14:31:20 +000011547 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011548}
11549
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011550PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011551 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011552\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011553Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011554done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011555
11556static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011557unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011558{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011559 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011560 Py_UCS4 fillchar = ' ';
11561
Victor Stinnere9a29352011-10-01 02:14:59 +020011562 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011563 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011564
Victor Stinnere9a29352011-10-01 02:14:59 +020011565 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011566 return NULL;
11567
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011568 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011569 Py_INCREF(self);
11570 return (PyObject*) self;
11571 }
11572
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011573 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011574}
11575
Alexander Belopolsky40018472011-02-26 01:02:56 +000011576PyObject *
11577PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011578{
11579 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011580
Guido van Rossumd57fd912000-03-10 22:53:23 +000011581 s = PyUnicode_FromObject(s);
11582 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011583 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011584 if (sep != NULL) {
11585 sep = PyUnicode_FromObject(sep);
11586 if (sep == NULL) {
11587 Py_DECREF(s);
11588 return NULL;
11589 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011590 }
11591
Victor Stinner9310abb2011-10-05 00:59:23 +020011592 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011593
11594 Py_DECREF(s);
11595 Py_XDECREF(sep);
11596 return result;
11597}
11598
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011599PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011600 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011601\n\
11602Return a list of the words in S, using sep as the\n\
11603delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011604splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011605whitespace string is a separator and empty strings are\n\
11606removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011607
11608static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011609unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011610{
11611 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011612 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011613
Martin v. Löwis18e16552006-02-15 17:27:45 +000011614 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011615 return NULL;
11616
11617 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011618 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011619 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020011620 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011621 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011622 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011623}
11624
Thomas Wouters477c8d52006-05-27 19:21:47 +000011625PyObject *
11626PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11627{
11628 PyObject* str_obj;
11629 PyObject* sep_obj;
11630 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011631 int kind1, kind2, kind;
11632 void *buf1 = NULL, *buf2 = NULL;
11633 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011634
11635 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011636 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011637 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011638 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011639 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011640 Py_DECREF(str_obj);
11641 return NULL;
11642 }
11643
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011644 kind1 = PyUnicode_KIND(str_in);
11645 kind2 = PyUnicode_KIND(sep_obj);
11646 kind = kind1 > kind2 ? kind1 : kind2;
11647 buf1 = PyUnicode_DATA(str_in);
11648 if (kind1 != kind)
11649 buf1 = _PyUnicode_AsKind(str_in, kind);
11650 if (!buf1)
11651 goto onError;
11652 buf2 = PyUnicode_DATA(sep_obj);
11653 if (kind2 != kind)
11654 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11655 if (!buf2)
11656 goto onError;
11657 len1 = PyUnicode_GET_LENGTH(str_obj);
11658 len2 = PyUnicode_GET_LENGTH(sep_obj);
11659
11660 switch(PyUnicode_KIND(str_in)) {
11661 case PyUnicode_1BYTE_KIND:
11662 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11663 break;
11664 case PyUnicode_2BYTE_KIND:
11665 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11666 break;
11667 case PyUnicode_4BYTE_KIND:
11668 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11669 break;
11670 default:
11671 assert(0);
11672 out = 0;
11673 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011674
11675 Py_DECREF(sep_obj);
11676 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011677 if (kind1 != kind)
11678 PyMem_Free(buf1);
11679 if (kind2 != kind)
11680 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011681
11682 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011683 onError:
11684 Py_DECREF(sep_obj);
11685 Py_DECREF(str_obj);
11686 if (kind1 != kind && buf1)
11687 PyMem_Free(buf1);
11688 if (kind2 != kind && buf2)
11689 PyMem_Free(buf2);
11690 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011691}
11692
11693
11694PyObject *
11695PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11696{
11697 PyObject* str_obj;
11698 PyObject* sep_obj;
11699 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011700 int kind1, kind2, kind;
11701 void *buf1 = NULL, *buf2 = NULL;
11702 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011703
11704 str_obj = PyUnicode_FromObject(str_in);
11705 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011706 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011707 sep_obj = PyUnicode_FromObject(sep_in);
11708 if (!sep_obj) {
11709 Py_DECREF(str_obj);
11710 return NULL;
11711 }
11712
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011713 kind1 = PyUnicode_KIND(str_in);
11714 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011715 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011716 buf1 = PyUnicode_DATA(str_in);
11717 if (kind1 != kind)
11718 buf1 = _PyUnicode_AsKind(str_in, kind);
11719 if (!buf1)
11720 goto onError;
11721 buf2 = PyUnicode_DATA(sep_obj);
11722 if (kind2 != kind)
11723 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11724 if (!buf2)
11725 goto onError;
11726 len1 = PyUnicode_GET_LENGTH(str_obj);
11727 len2 = PyUnicode_GET_LENGTH(sep_obj);
11728
11729 switch(PyUnicode_KIND(str_in)) {
11730 case PyUnicode_1BYTE_KIND:
11731 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11732 break;
11733 case PyUnicode_2BYTE_KIND:
11734 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11735 break;
11736 case PyUnicode_4BYTE_KIND:
11737 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11738 break;
11739 default:
11740 assert(0);
11741 out = 0;
11742 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011743
11744 Py_DECREF(sep_obj);
11745 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011746 if (kind1 != kind)
11747 PyMem_Free(buf1);
11748 if (kind2 != kind)
11749 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011750
11751 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011752 onError:
11753 Py_DECREF(sep_obj);
11754 Py_DECREF(str_obj);
11755 if (kind1 != kind && buf1)
11756 PyMem_Free(buf1);
11757 if (kind2 != kind && buf2)
11758 PyMem_Free(buf2);
11759 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011760}
11761
11762PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011763 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011764\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011765Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011766the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011767found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011768
11769static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011770unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011771{
Victor Stinner9310abb2011-10-05 00:59:23 +020011772 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011773}
11774
11775PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011776 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011777\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011778Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011779the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011780separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011781
11782static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011783unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011784{
Victor Stinner9310abb2011-10-05 00:59:23 +020011785 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011786}
11787
Alexander Belopolsky40018472011-02-26 01:02:56 +000011788PyObject *
11789PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011790{
11791 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011792
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011793 s = PyUnicode_FromObject(s);
11794 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011795 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011796 if (sep != NULL) {
11797 sep = PyUnicode_FromObject(sep);
11798 if (sep == NULL) {
11799 Py_DECREF(s);
11800 return NULL;
11801 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011802 }
11803
Victor Stinner9310abb2011-10-05 00:59:23 +020011804 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011805
11806 Py_DECREF(s);
11807 Py_XDECREF(sep);
11808 return result;
11809}
11810
11811PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011812 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011813\n\
11814Return a list of the words in S, using sep as the\n\
11815delimiter string, starting at the end of the string and\n\
11816working to the front. If maxsplit is given, at most maxsplit\n\
11817splits are done. If sep is not specified, any whitespace string\n\
11818is a separator.");
11819
11820static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011821unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011822{
11823 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011824 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011825
Martin v. Löwis18e16552006-02-15 17:27:45 +000011826 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011827 return NULL;
11828
11829 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011830 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011831 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020011832 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011833 else
Victor Stinner9310abb2011-10-05 00:59:23 +020011834 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011835}
11836
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011837PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011838 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011839\n\
11840Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011841Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011842is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011843
11844static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011845unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011846{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011847 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011848 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011849
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011850 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11851 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011852 return NULL;
11853
Guido van Rossum86662912000-04-11 15:38:46 +000011854 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011855}
11856
11857static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011858PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011859{
Walter Dörwald346737f2007-05-31 10:44:43 +000011860 if (PyUnicode_CheckExact(self)) {
11861 Py_INCREF(self);
11862 return self;
11863 } else
11864 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020011865 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011866}
11867
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011868PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011869 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011870\n\
11871Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011872and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011873
11874static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011875unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011876{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011877 return fixup(self, fixswapcase);
11878}
11879
Georg Brandlceee0772007-11-27 23:48:05 +000011880PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011881 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011882\n\
11883Return a translation table usable for str.translate().\n\
11884If there is only one argument, it must be a dictionary mapping Unicode\n\
11885ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011886Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011887If there are two arguments, they must be strings of equal length, and\n\
11888in the resulting dictionary, each character in x will be mapped to the\n\
11889character at the same position in y. If there is a third argument, it\n\
11890must be a string, whose characters will be mapped to None in the result.");
11891
11892static PyObject*
11893unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11894{
11895 PyObject *x, *y = NULL, *z = NULL;
11896 PyObject *new = NULL, *key, *value;
11897 Py_ssize_t i = 0;
11898 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011899
Georg Brandlceee0772007-11-27 23:48:05 +000011900 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11901 return NULL;
11902 new = PyDict_New();
11903 if (!new)
11904 return NULL;
11905 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011906 int x_kind, y_kind, z_kind;
11907 void *x_data, *y_data, *z_data;
11908
Georg Brandlceee0772007-11-27 23:48:05 +000011909 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011910 if (!PyUnicode_Check(x)) {
11911 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11912 "be a string if there is a second argument");
11913 goto err;
11914 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011915 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011916 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11917 "arguments must have equal length");
11918 goto err;
11919 }
11920 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011921 x_kind = PyUnicode_KIND(x);
11922 y_kind = PyUnicode_KIND(y);
11923 x_data = PyUnicode_DATA(x);
11924 y_data = PyUnicode_DATA(y);
11925 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11926 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11927 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011928 if (!key || !value)
11929 goto err;
11930 res = PyDict_SetItem(new, key, value);
11931 Py_DECREF(key);
11932 Py_DECREF(value);
11933 if (res < 0)
11934 goto err;
11935 }
11936 /* create entries for deleting chars in z */
11937 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011938 z_kind = PyUnicode_KIND(z);
11939 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011940 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011941 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011942 if (!key)
11943 goto err;
11944 res = PyDict_SetItem(new, key, Py_None);
11945 Py_DECREF(key);
11946 if (res < 0)
11947 goto err;
11948 }
11949 }
11950 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011951 int kind;
11952 void *data;
11953
Georg Brandlceee0772007-11-27 23:48:05 +000011954 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011955 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011956 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11957 "to maketrans it must be a dict");
11958 goto err;
11959 }
11960 /* copy entries into the new dict, converting string keys to int keys */
11961 while (PyDict_Next(x, &i, &key, &value)) {
11962 if (PyUnicode_Check(key)) {
11963 /* convert string keys to integer keys */
11964 PyObject *newkey;
11965 if (PyUnicode_GET_SIZE(key) != 1) {
11966 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11967 "table must be of length 1");
11968 goto err;
11969 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011970 kind = PyUnicode_KIND(key);
11971 data = PyUnicode_DATA(key);
11972 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011973 if (!newkey)
11974 goto err;
11975 res = PyDict_SetItem(new, newkey, value);
11976 Py_DECREF(newkey);
11977 if (res < 0)
11978 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011979 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011980 /* just keep integer keys */
11981 if (PyDict_SetItem(new, key, value) < 0)
11982 goto err;
11983 } else {
11984 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11985 "be strings or integers");
11986 goto err;
11987 }
11988 }
11989 }
11990 return new;
11991 err:
11992 Py_DECREF(new);
11993 return NULL;
11994}
11995
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011996PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011997 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011998\n\
11999Return a copy of the string S, where all characters have been mapped\n\
12000through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012001Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012002Unmapped characters are left untouched. Characters mapped to None\n\
12003are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012004
12005static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012006unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012007{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012008 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012009}
12010
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012011PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012012 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012013\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012014Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012015
12016static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012017unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012018{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012019 return fixup(self, fixupper);
12020}
12021
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012022PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012023 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012024\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012025Pad a numeric string S with zeros on the left, to fill a field\n\
12026of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012027
12028static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012029unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012030{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012031 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012032 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012033 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012034 int kind;
12035 void *data;
12036 Py_UCS4 chr;
12037
12038 if (PyUnicode_READY(self) == -1)
12039 return NULL;
12040
Martin v. Löwis18e16552006-02-15 17:27:45 +000012041 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012042 return NULL;
12043
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012044 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012045 if (PyUnicode_CheckExact(self)) {
12046 Py_INCREF(self);
12047 return (PyObject*) self;
12048 }
12049 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020012050 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012051 }
12052
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012053 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012054
12055 u = pad(self, fill, 0, '0');
12056
Walter Dörwald068325e2002-04-15 13:36:47 +000012057 if (u == NULL)
12058 return NULL;
12059
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012060 kind = PyUnicode_KIND(u);
12061 data = PyUnicode_DATA(u);
12062 chr = PyUnicode_READ(kind, data, fill);
12063
12064 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012065 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012066 PyUnicode_WRITE(kind, data, 0, chr);
12067 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012068 }
12069
12070 return (PyObject*) u;
12071}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012072
#if 0
/* Compiled out.  Forwards to PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *ascii = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return ascii;
}
#endif
12080
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012081PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012082 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012083\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012084Return True if S starts with the specified prefix, False otherwise.\n\
12085With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012086With optional end, stop comparing S at that position.\n\
12087prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012088
12089static PyObject *
12090unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012091 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012092{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012093 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012094 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012095 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012096 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012097 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012098
Jesus Ceaac451502011-04-20 17:09:23 +020012099 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012100 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012101 if (PyTuple_Check(subobj)) {
12102 Py_ssize_t i;
12103 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12104 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012105 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012106 if (substring == NULL)
12107 return NULL;
12108 result = tailmatch(self, substring, start, end, -1);
12109 Py_DECREF(substring);
12110 if (result) {
12111 Py_RETURN_TRUE;
12112 }
12113 }
12114 /* nothing matched */
12115 Py_RETURN_FALSE;
12116 }
12117 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012118 if (substring == NULL) {
12119 if (PyErr_ExceptionMatches(PyExc_TypeError))
12120 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12121 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012122 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012123 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012124 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012125 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012126 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012127}
12128
12129
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012130PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012131 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012132\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012133Return True if S ends with the specified suffix, False otherwise.\n\
12134With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012135With optional end, stop comparing S at that position.\n\
12136suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012137
12138static PyObject *
12139unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012140 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012141{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012142 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012143 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012144 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012145 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012146 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012147
Jesus Ceaac451502011-04-20 17:09:23 +020012148 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012149 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012150 if (PyTuple_Check(subobj)) {
12151 Py_ssize_t i;
12152 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12153 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012154 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012155 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012156 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012157 result = tailmatch(self, substring, start, end, +1);
12158 Py_DECREF(substring);
12159 if (result) {
12160 Py_RETURN_TRUE;
12161 }
12162 }
12163 Py_RETURN_FALSE;
12164 }
12165 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012166 if (substring == NULL) {
12167 if (PyErr_ExceptionMatches(PyExc_TypeError))
12168 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12169 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012170 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012171 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012172 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012173 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012174 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012175}
12176
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012177#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012178
12179PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012180 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012181\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012182Return a formatted version of S, using substitutions from args and kwargs.\n\
12183The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012184
Eric Smith27bbca62010-11-04 17:06:58 +000012185PyDoc_STRVAR(format_map__doc__,
12186 "S.format_map(mapping) -> str\n\
12187\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012188Return a formatted version of S, using substitutions from mapping.\n\
12189The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012190
Eric Smith4a7d76d2008-05-30 18:10:19 +000012191static PyObject *
12192unicode__format__(PyObject* self, PyObject* args)
12193{
12194 PyObject *format_spec;
12195
12196 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12197 return NULL;
12198
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012199 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
12200 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000012201}
12202
Eric Smith8c663262007-08-25 02:26:07 +000012203PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012204 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012205\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012206Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012207
12208static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012209unicode__sizeof__(PyUnicodeObject *v)
12210{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012211 Py_ssize_t size;
12212
12213 /* If it's a compact object, account for base structure +
12214 character data. */
12215 if (PyUnicode_IS_COMPACT_ASCII(v))
12216 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12217 else if (PyUnicode_IS_COMPACT(v))
12218 size = sizeof(PyCompactUnicodeObject) +
12219 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
12220 else {
12221 /* If it is a two-block object, account for base object, and
12222 for character block if present. */
12223 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012224 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012225 size += (PyUnicode_GET_LENGTH(v) + 1) *
12226 PyUnicode_CHARACTER_SIZE(v);
12227 }
12228 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012229 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012230 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012231 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012232 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012233 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012234
12235 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012236}
12237
12238PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012239 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012240
12241static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012242unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012243{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012244 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012245 if (!copy)
12246 return NULL;
12247 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012248}
12249
Guido van Rossumd57fd912000-03-10 22:53:23 +000012250static PyMethodDef unicode_methods[] = {
12251
12252 /* Order is according to common usage: often used methods should
12253 appear first, since lookup is done sequentially. */
12254
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012255 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012256 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12257 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012258 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012259 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12260 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12261 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12262 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12263 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12264 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12265 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012266 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012267 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12268 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12269 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012270 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012271 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12272 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12273 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012274 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012275 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012276 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012277 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012278 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12279 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12280 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12281 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12282 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12283 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12284 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12285 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12286 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12287 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12288 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12289 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12290 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12291 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012292 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012293 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012294 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012295 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012296 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012297 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012298 {"maketrans", (PyCFunction) unicode_maketrans,
12299 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012300 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012301#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012302 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012303#endif
12304
12305#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012306 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012307 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012308#endif
12309
Benjamin Peterson14339b62009-01-31 16:36:08 +000012310 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012311 {NULL, NULL}
12312};
12313
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012314static PyObject *
12315unicode_mod(PyObject *v, PyObject *w)
12316{
Brian Curtindfc80e32011-08-10 20:28:54 -050012317 if (!PyUnicode_Check(v))
12318 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012319 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012320}
12321
12322static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012323 0, /*nb_add*/
12324 0, /*nb_subtract*/
12325 0, /*nb_multiply*/
12326 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012327};
12328
Guido van Rossumd57fd912000-03-10 22:53:23 +000012329static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012330 (lenfunc) unicode_length, /* sq_length */
12331 PyUnicode_Concat, /* sq_concat */
12332 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12333 (ssizeargfunc) unicode_getitem, /* sq_item */
12334 0, /* sq_slice */
12335 0, /* sq_ass_item */
12336 0, /* sq_ass_slice */
12337 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012338};
12339
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012340static PyObject*
12341unicode_subscript(PyUnicodeObject* self, PyObject* item)
12342{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012343 if (PyUnicode_READY(self) == -1)
12344 return NULL;
12345
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012346 if (PyIndex_Check(item)) {
12347 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012348 if (i == -1 && PyErr_Occurred())
12349 return NULL;
12350 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012351 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012352 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012353 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012354 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012355 PyObject *result;
12356 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012357 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012358 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012359
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012360 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012361 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012362 return NULL;
12363 }
12364
12365 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012366 return PyUnicode_New(0, 0);
12367 } else if (start == 0 && step == 1 &&
12368 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012369 PyUnicode_CheckExact(self)) {
12370 Py_INCREF(self);
12371 return (PyObject *)self;
12372 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012373 return PyUnicode_Substring((PyObject*)self,
12374 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012375 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012376 /* General case */
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012377 max_char = 0;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012378 src_kind = PyUnicode_KIND(self);
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012379 kind_limit = kind_maxchar_limit(src_kind);
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012380 src_data = PyUnicode_DATA(self);
12381 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12382 ch = PyUnicode_READ(src_kind, src_data, cur);
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012383 if (ch > max_char) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012384 max_char = ch;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012385 if (max_char >= kind_limit)
12386 break;
12387 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012388 }
12389 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012390 if (result == NULL)
12391 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012392 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012393 dest_data = PyUnicode_DATA(result);
12394
12395 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012396 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
12397 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012398 }
12399 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012400 } else {
12401 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12402 return NULL;
12403 }
12404}
12405
12406static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012407 (lenfunc)unicode_length, /* mp_length */
12408 (binaryfunc)unicode_subscript, /* mp_subscript */
12409 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012410};
12411
Guido van Rossumd57fd912000-03-10 22:53:23 +000012412
Guido van Rossumd57fd912000-03-10 22:53:23 +000012413/* Helpers for PyUnicode_Format() */
12414
12415static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012416getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012417{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012418 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012419 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012420 (*p_argidx)++;
12421 if (arglen < 0)
12422 return args;
12423 else
12424 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012425 }
12426 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012427 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012428 return NULL;
12429}
12430
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012431/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012432
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012433static PyObject *
12434formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012435{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012436 char *p;
12437 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012438 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012439
Guido van Rossumd57fd912000-03-10 22:53:23 +000012440 x = PyFloat_AsDouble(v);
12441 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012442 return NULL;
12443
Guido van Rossumd57fd912000-03-10 22:53:23 +000012444 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012445 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012446
Eric Smith0923d1d2009-04-16 20:16:10 +000012447 p = PyOS_double_to_string(x, type, prec,
12448 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012449 if (p == NULL)
12450 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012451 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012452 PyMem_Free(p);
12453 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012454}
12455
Tim Peters38fd5b62000-09-21 05:43:11 +000012456static PyObject*
12457formatlong(PyObject *val, int flags, int prec, int type)
12458{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012459 char *buf;
12460 int len;
12461 PyObject *str; /* temporary string object. */
12462 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012463
Benjamin Peterson14339b62009-01-31 16:36:08 +000012464 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12465 if (!str)
12466 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012467 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012468 Py_DECREF(str);
12469 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012470}
12471
Guido van Rossumd57fd912000-03-10 22:53:23 +000012472static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012473formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012474 size_t buflen,
12475 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012476{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012477 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012478 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012479 if (PyUnicode_GET_LENGTH(v) == 1) {
12480 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012481 buf[1] = '\0';
12482 return 1;
12483 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012484 goto onError;
12485 }
12486 else {
12487 /* Integer input truncated to a character */
12488 long x;
12489 x = PyLong_AsLong(v);
12490 if (x == -1 && PyErr_Occurred())
12491 goto onError;
12492
12493 if (x < 0 || x > 0x10ffff) {
12494 PyErr_SetString(PyExc_OverflowError,
12495 "%c arg not in range(0x110000)");
12496 return -1;
12497 }
12498
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012499 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012500 buf[1] = '\0';
12501 return 1;
12502 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012503
Benjamin Peterson29060642009-01-31 22:14:21 +000012504 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012505 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012506 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012507 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012508}
12509
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
   FORMATBUFLEN is the length of the buffer in which chars are formatted.
   The expansion is fully parenthesised so that it behaves as a single
   operand wherever it is used (for example after a bare sizeof).
*/
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012514
Alexander Belopolsky40018472011-02-26 01:02:56 +000012515PyObject *
12516PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012517{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012518 void *fmt;
12519 int fmtkind;
12520 PyObject *result;
12521 Py_UCS4 *res, *res0;
12522 Py_UCS4 max;
12523 int kind;
12524 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012525 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012526 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012527 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000012528
Guido van Rossumd57fd912000-03-10 22:53:23 +000012529 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012530 PyErr_BadInternalCall();
12531 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012532 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012533 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12534 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012535 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012536 fmt = PyUnicode_DATA(uformat);
12537 fmtkind = PyUnicode_KIND(uformat);
12538 fmtcnt = PyUnicode_GET_LENGTH(uformat);
12539 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012540
12541 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012542 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
12543 if (res0 == NULL) {
12544 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012545 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012546 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012547
12548 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012549 arglen = PyTuple_Size(args);
12550 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012551 }
12552 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012553 arglen = -1;
12554 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012555 }
Christian Heimes90aa7642007-12-19 02:45:37 +000012556 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000012557 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000012558 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012559
12560 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012561 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012562 if (--rescnt < 0) {
12563 rescnt = fmtcnt + 100;
12564 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012565 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12566 if (res0 == NULL){
12567 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012568 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012569 }
12570 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012571 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012572 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012573 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012574 }
12575 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012576 /* Got a format specifier */
12577 int flags = 0;
12578 Py_ssize_t width = -1;
12579 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012580 Py_UCS4 c = '\0';
12581 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000012582 int isnumok;
12583 PyObject *v = NULL;
12584 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012585 void *pbuf;
12586 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000012587 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012588 Py_ssize_t len, len1;
12589 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012590
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012591 fmtpos++;
12592 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12593 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012594 Py_ssize_t keylen;
12595 PyObject *key;
12596 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012597
Benjamin Peterson29060642009-01-31 22:14:21 +000012598 if (dict == NULL) {
12599 PyErr_SetString(PyExc_TypeError,
12600 "format requires a mapping");
12601 goto onError;
12602 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012603 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012604 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012605 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012606 /* Skip over balanced parentheses */
12607 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012608 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012609 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012610 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012611 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012612 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012613 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012614 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012615 if (fmtcnt < 0 || pcount > 0) {
12616 PyErr_SetString(PyExc_ValueError,
12617 "incomplete format key");
12618 goto onError;
12619 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012620 key = PyUnicode_Substring((PyObject*)uformat,
12621 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012622 if (key == NULL)
12623 goto onError;
12624 if (args_owned) {
12625 Py_DECREF(args);
12626 args_owned = 0;
12627 }
12628 args = PyObject_GetItem(dict, key);
12629 Py_DECREF(key);
12630 if (args == NULL) {
12631 goto onError;
12632 }
12633 args_owned = 1;
12634 arglen = -1;
12635 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012636 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012637 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012638 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012639 case '-': flags |= F_LJUST; continue;
12640 case '+': flags |= F_SIGN; continue;
12641 case ' ': flags |= F_BLANK; continue;
12642 case '#': flags |= F_ALT; continue;
12643 case '0': flags |= F_ZERO; continue;
12644 }
12645 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012646 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012647 if (c == '*') {
12648 v = getnextarg(args, arglen, &argidx);
12649 if (v == NULL)
12650 goto onError;
12651 if (!PyLong_Check(v)) {
12652 PyErr_SetString(PyExc_TypeError,
12653 "* wants int");
12654 goto onError;
12655 }
12656 width = PyLong_AsLong(v);
12657 if (width == -1 && PyErr_Occurred())
12658 goto onError;
12659 if (width < 0) {
12660 flags |= F_LJUST;
12661 width = -width;
12662 }
12663 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012664 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012665 }
12666 else if (c >= '0' && c <= '9') {
12667 width = c - '0';
12668 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012669 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012670 if (c < '0' || c > '9')
12671 break;
12672 if ((width*10) / 10 != width) {
12673 PyErr_SetString(PyExc_ValueError,
12674 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012675 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012676 }
12677 width = width*10 + (c - '0');
12678 }
12679 }
12680 if (c == '.') {
12681 prec = 0;
12682 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012683 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012684 if (c == '*') {
12685 v = getnextarg(args, arglen, &argidx);
12686 if (v == NULL)
12687 goto onError;
12688 if (!PyLong_Check(v)) {
12689 PyErr_SetString(PyExc_TypeError,
12690 "* wants int");
12691 goto onError;
12692 }
12693 prec = PyLong_AsLong(v);
12694 if (prec == -1 && PyErr_Occurred())
12695 goto onError;
12696 if (prec < 0)
12697 prec = 0;
12698 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012699 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012700 }
12701 else if (c >= '0' && c <= '9') {
12702 prec = c - '0';
12703 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012704 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012705 if (c < '0' || c > '9')
12706 break;
12707 if ((prec*10) / 10 != prec) {
12708 PyErr_SetString(PyExc_ValueError,
12709 "prec too big");
12710 goto onError;
12711 }
12712 prec = prec*10 + (c - '0');
12713 }
12714 }
12715 } /* prec */
12716 if (fmtcnt >= 0) {
12717 if (c == 'h' || c == 'l' || c == 'L') {
12718 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012719 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012720 }
12721 }
12722 if (fmtcnt < 0) {
12723 PyErr_SetString(PyExc_ValueError,
12724 "incomplete format");
12725 goto onError;
12726 }
12727 if (c != '%') {
12728 v = getnextarg(args, arglen, &argidx);
12729 if (v == NULL)
12730 goto onError;
12731 }
12732 sign = 0;
12733 fill = ' ';
12734 switch (c) {
12735
12736 case '%':
12737 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012738 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012739 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012740 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012741 len = 1;
12742 break;
12743
12744 case 's':
12745 case 'r':
12746 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012747 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012748 temp = v;
12749 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012750 }
12751 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012752 if (c == 's')
12753 temp = PyObject_Str(v);
12754 else if (c == 'r')
12755 temp = PyObject_Repr(v);
12756 else
12757 temp = PyObject_ASCII(v);
12758 if (temp == NULL)
12759 goto onError;
12760 if (PyUnicode_Check(temp))
12761 /* nothing to do */;
12762 else {
12763 Py_DECREF(temp);
12764 PyErr_SetString(PyExc_TypeError,
12765 "%s argument has non-string str()");
12766 goto onError;
12767 }
12768 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012769 if (PyUnicode_READY(temp) == -1) {
12770 Py_CLEAR(temp);
12771 goto onError;
12772 }
12773 pbuf = PyUnicode_DATA(temp);
12774 kind = PyUnicode_KIND(temp);
12775 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012776 if (prec >= 0 && len > prec)
12777 len = prec;
12778 break;
12779
12780 case 'i':
12781 case 'd':
12782 case 'u':
12783 case 'o':
12784 case 'x':
12785 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012786 isnumok = 0;
12787 if (PyNumber_Check(v)) {
12788 PyObject *iobj=NULL;
12789
12790 if (PyLong_Check(v)) {
12791 iobj = v;
12792 Py_INCREF(iobj);
12793 }
12794 else {
12795 iobj = PyNumber_Long(v);
12796 }
12797 if (iobj!=NULL) {
12798 if (PyLong_Check(iobj)) {
12799 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012800 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012801 Py_DECREF(iobj);
12802 if (!temp)
12803 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012804 if (PyUnicode_READY(temp) == -1) {
12805 Py_CLEAR(temp);
12806 goto onError;
12807 }
12808 pbuf = PyUnicode_DATA(temp);
12809 kind = PyUnicode_KIND(temp);
12810 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012811 sign = 1;
12812 }
12813 else {
12814 Py_DECREF(iobj);
12815 }
12816 }
12817 }
12818 if (!isnumok) {
12819 PyErr_Format(PyExc_TypeError,
12820 "%%%c format: a number is required, "
12821 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12822 goto onError;
12823 }
12824 if (flags & F_ZERO)
12825 fill = '0';
12826 break;
12827
12828 case 'e':
12829 case 'E':
12830 case 'f':
12831 case 'F':
12832 case 'g':
12833 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012834 temp = formatfloat(v, flags, prec, c);
12835 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012836 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012837 if (PyUnicode_READY(temp) == -1) {
12838 Py_CLEAR(temp);
12839 goto onError;
12840 }
12841 pbuf = PyUnicode_DATA(temp);
12842 kind = PyUnicode_KIND(temp);
12843 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012844 sign = 1;
12845 if (flags & F_ZERO)
12846 fill = '0';
12847 break;
12848
12849 case 'c':
12850 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012851 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012852 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012853 if (len < 0)
12854 goto onError;
12855 break;
12856
12857 default:
12858 PyErr_Format(PyExc_ValueError,
12859 "unsupported format character '%c' (0x%x) "
12860 "at index %zd",
12861 (31<=c && c<=126) ? (char)c : '?',
12862 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012863 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012864 goto onError;
12865 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012866 /* pbuf is initialized here. */
12867 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012868 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012869 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12870 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12871 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012872 len--;
12873 }
12874 else if (flags & F_SIGN)
12875 sign = '+';
12876 else if (flags & F_BLANK)
12877 sign = ' ';
12878 else
12879 sign = 0;
12880 }
12881 if (width < len)
12882 width = len;
12883 if (rescnt - (sign != 0) < width) {
12884 reslen -= rescnt;
12885 rescnt = width + fmtcnt + 100;
12886 reslen += rescnt;
12887 if (reslen < 0) {
12888 Py_XDECREF(temp);
12889 PyErr_NoMemory();
12890 goto onError;
12891 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012892 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12893 if (res0 == 0) {
12894 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012895 Py_XDECREF(temp);
12896 goto onError;
12897 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012898 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012899 }
12900 if (sign) {
12901 if (fill != ' ')
12902 *res++ = sign;
12903 rescnt--;
12904 if (width > len)
12905 width--;
12906 }
12907 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012908 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12909 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012910 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012911 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12912 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012913 }
12914 rescnt -= 2;
12915 width -= 2;
12916 if (width < 0)
12917 width = 0;
12918 len -= 2;
12919 }
12920 if (width > len && !(flags & F_LJUST)) {
12921 do {
12922 --rescnt;
12923 *res++ = fill;
12924 } while (--width > len);
12925 }
12926 if (fill == ' ') {
12927 if (sign)
12928 *res++ = sign;
12929 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012930 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12931 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12932 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12933 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012934 }
12935 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012936 /* Copy all characters, preserving len */
12937 len1 = len;
12938 while (len1--) {
12939 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12940 rescnt--;
12941 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012942 while (--width >= len) {
12943 --rescnt;
12944 *res++ = ' ';
12945 }
12946 if (dict && (argidx < arglen) && c != '%') {
12947 PyErr_SetString(PyExc_TypeError,
12948 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012949 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012950 goto onError;
12951 }
12952 Py_XDECREF(temp);
12953 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012954 } /* until end */
12955 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012956 PyErr_SetString(PyExc_TypeError,
12957 "not all arguments converted during string formatting");
12958 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012959 }
12960
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012961
12962 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12963 if (*res > max)
12964 max = *res;
12965 result = PyUnicode_New(reslen - rescnt, max);
12966 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012967 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012968 kind = PyUnicode_KIND(result);
12969 for (res = res0; res < res0+reslen-rescnt; res++)
12970 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12971 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012972 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012973 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012974 }
12975 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012976 return (PyObject *)result;
12977
Benjamin Peterson29060642009-01-31 22:14:21 +000012978 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012979 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012980 Py_DECREF(uformat);
12981 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012982 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012983 }
12984 return NULL;
12985}
12986
Jeremy Hylton938ace62002-07-17 16:30:39 +000012987static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012988unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12989
Tim Peters6d6c1a32001-08-02 04:15:00 +000012990static PyObject *
12991unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12992{
Benjamin Peterson29060642009-01-31 22:14:21 +000012993 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012994 static char *kwlist[] = {"object", "encoding", "errors", 0};
12995 char *encoding = NULL;
12996 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012997
Benjamin Peterson14339b62009-01-31 16:36:08 +000012998 if (type != &PyUnicode_Type)
12999 return unicode_subtype_new(type, args, kwds);
13000 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013001 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013002 return NULL;
13003 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013004 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013005 if (encoding == NULL && errors == NULL)
13006 return PyObject_Str(x);
13007 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013008 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013009}
13010
Guido van Rossume023fe02001-08-30 03:12:59 +000013011static PyObject *
13012unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13013{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013014 PyUnicodeObject *unicode, *self;
13015 Py_ssize_t length, char_size;
13016 int share_wstr, share_utf8;
13017 unsigned int kind;
13018 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013019
Benjamin Peterson14339b62009-01-31 16:36:08 +000013020 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013021
13022 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
13023 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013024 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013025 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013026 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013027 return NULL;
13028
13029 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
13030 if (self == NULL) {
13031 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013032 return NULL;
13033 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013034 kind = PyUnicode_KIND(unicode);
13035 length = PyUnicode_GET_LENGTH(unicode);
13036
13037 _PyUnicode_LENGTH(self) = length;
13038 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13039 _PyUnicode_STATE(self).interned = 0;
13040 _PyUnicode_STATE(self).kind = kind;
13041 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013042 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013043 _PyUnicode_STATE(self).ready = 1;
13044 _PyUnicode_WSTR(self) = NULL;
13045 _PyUnicode_UTF8_LENGTH(self) = 0;
13046 _PyUnicode_UTF8(self) = NULL;
13047 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013048 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013049
13050 share_utf8 = 0;
13051 share_wstr = 0;
13052 if (kind == PyUnicode_1BYTE_KIND) {
13053 char_size = 1;
13054 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13055 share_utf8 = 1;
13056 }
13057 else if (kind == PyUnicode_2BYTE_KIND) {
13058 char_size = 2;
13059 if (sizeof(wchar_t) == 2)
13060 share_wstr = 1;
13061 }
13062 else {
13063 assert(kind == PyUnicode_4BYTE_KIND);
13064 char_size = 4;
13065 if (sizeof(wchar_t) == 4)
13066 share_wstr = 1;
13067 }
13068
13069 /* Ensure we won't overflow the length. */
13070 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13071 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013072 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013073 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013074 data = PyObject_MALLOC((length + 1) * char_size);
13075 if (data == NULL) {
13076 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013077 goto onError;
13078 }
13079
Victor Stinnerc3c74152011-10-02 20:39:55 +020013080 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013081 if (share_utf8) {
13082 _PyUnicode_UTF8_LENGTH(self) = length;
13083 _PyUnicode_UTF8(self) = data;
13084 }
13085 if (share_wstr) {
13086 _PyUnicode_WSTR_LENGTH(self) = length;
13087 _PyUnicode_WSTR(self) = (wchar_t *)data;
13088 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013089
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013090 Py_MEMCPY(data, PyUnicode_DATA(unicode),
13091 PyUnicode_KIND_SIZE(kind, length + 1));
13092 Py_DECREF(unicode);
13093 return (PyObject *)self;
13094
13095onError:
13096 Py_DECREF(unicode);
13097 Py_DECREF(self);
13098 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013099}
13100
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013101PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013102 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013103\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013104Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013105encoding defaults to the current default string encoding.\n\
13106errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013107
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013108static PyObject *unicode_iter(PyObject *seq);
13109
Guido van Rossumd57fd912000-03-10 22:53:23 +000013110PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013111 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013112 "str", /* tp_name */
13113 sizeof(PyUnicodeObject), /* tp_size */
13114 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013115 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013116 (destructor)unicode_dealloc, /* tp_dealloc */
13117 0, /* tp_print */
13118 0, /* tp_getattr */
13119 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013120 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013121 unicode_repr, /* tp_repr */
13122 &unicode_as_number, /* tp_as_number */
13123 &unicode_as_sequence, /* tp_as_sequence */
13124 &unicode_as_mapping, /* tp_as_mapping */
13125 (hashfunc) unicode_hash, /* tp_hash*/
13126 0, /* tp_call*/
13127 (reprfunc) unicode_str, /* tp_str */
13128 PyObject_GenericGetAttr, /* tp_getattro */
13129 0, /* tp_setattro */
13130 0, /* tp_as_buffer */
13131 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013132 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013133 unicode_doc, /* tp_doc */
13134 0, /* tp_traverse */
13135 0, /* tp_clear */
13136 PyUnicode_RichCompare, /* tp_richcompare */
13137 0, /* tp_weaklistoffset */
13138 unicode_iter, /* tp_iter */
13139 0, /* tp_iternext */
13140 unicode_methods, /* tp_methods */
13141 0, /* tp_members */
13142 0, /* tp_getset */
13143 &PyBaseObject_Type, /* tp_base */
13144 0, /* tp_dict */
13145 0, /* tp_descr_get */
13146 0, /* tp_descr_set */
13147 0, /* tp_dictoffset */
13148 0, /* tp_init */
13149 0, /* tp_alloc */
13150 unicode_new, /* tp_new */
13151 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013152};
13153
13154/* Initialize the Unicode implementation */
13155
Thomas Wouters78890102000-07-22 19:25:51 +000013156void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013157{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013158 int i;
13159
Thomas Wouters477c8d52006-05-27 19:21:47 +000013160 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013161 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013162 0x000A, /* LINE FEED */
13163 0x000D, /* CARRIAGE RETURN */
13164 0x001C, /* FILE SEPARATOR */
13165 0x001D, /* GROUP SEPARATOR */
13166 0x001E, /* RECORD SEPARATOR */
13167 0x0085, /* NEXT LINE */
13168 0x2028, /* LINE SEPARATOR */
13169 0x2029, /* PARAGRAPH SEPARATOR */
13170 };
13171
Fred Drakee4315f52000-05-09 19:53:39 +000013172 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013173 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013174 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013175 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013176
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013177 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013178 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013179 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013180 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013181
13182 /* initialize the linebreak bloom filter */
13183 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013184 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013185 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013186
13187 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013188}
13189
13190/* Finalize the Unicode implementation */
13191
/* Releases nothing in this implementation; the result is always 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
13197
Guido van Rossumd57fd912000-03-10 22:53:23 +000013198void
Thomas Wouters78890102000-07-22 19:25:51 +000013199_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013200{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013201 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013202
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013203 Py_XDECREF(unicode_empty);
13204 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013205
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013206 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013207 if (unicode_latin1[i]) {
13208 Py_DECREF(unicode_latin1[i]);
13209 unicode_latin1[i] = NULL;
13210 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013211 }
Christian Heimesa156e092008-02-16 07:38:31 +000013212 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013213}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013214
Walter Dörwald16807132007-05-25 13:52:07 +000013215void
13216PyUnicode_InternInPlace(PyObject **p)
13217{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013218 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
13219 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013220#ifdef Py_DEBUG
13221 assert(s != NULL);
13222 assert(_PyUnicode_CHECK(s));
13223#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013224 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013225 return;
13226#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013227 /* If it's a subclass, we don't really know what putting
13228 it in the interned dict might do. */
13229 if (!PyUnicode_CheckExact(s))
13230 return;
13231 if (PyUnicode_CHECK_INTERNED(s))
13232 return;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013233 if (_PyUnicode_READY_REPLACE(p)) {
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013234 assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013235 return;
13236 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013237 s = (PyUnicodeObject *)(*p);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013238 if (interned == NULL) {
13239 interned = PyDict_New();
13240 if (interned == NULL) {
13241 PyErr_Clear(); /* Don't leave an exception */
13242 return;
13243 }
13244 }
13245 /* It might be that the GetItem call fails even
13246 though the key is present in the dictionary,
13247 namely when this happens during a stack overflow. */
13248 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000013249 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013250 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013251
Benjamin Peterson29060642009-01-31 22:14:21 +000013252 if (t) {
13253 Py_INCREF(t);
13254 Py_DECREF(*p);
13255 *p = t;
13256 return;
13257 }
Walter Dörwald16807132007-05-25 13:52:07 +000013258
Benjamin Peterson14339b62009-01-31 16:36:08 +000013259 PyThreadState_GET()->recursion_critical = 1;
13260 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
13261 PyErr_Clear();
13262 PyThreadState_GET()->recursion_critical = 0;
13263 return;
13264 }
13265 PyThreadState_GET()->recursion_critical = 0;
13266 /* The two references in interned are not counted by refcnt.
13267 The deallocator will take care of this */
13268 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013269 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013270}
13271
13272void
13273PyUnicode_InternImmortal(PyObject **p)
13274{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013275 PyUnicodeObject *u = (PyUnicodeObject *)*p;
13276
Benjamin Peterson14339b62009-01-31 16:36:08 +000013277 PyUnicode_InternInPlace(p);
13278 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013279 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013280 Py_INCREF(*p);
13281 }
Walter Dörwald16807132007-05-25 13:52:07 +000013282}
13283
13284PyObject *
13285PyUnicode_InternFromString(const char *cp)
13286{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013287 PyObject *s = PyUnicode_FromString(cp);
13288 if (s == NULL)
13289 return NULL;
13290 PyUnicode_InternInPlace(&s);
13291 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013292}
13293
Alexander Belopolsky40018472011-02-26 01:02:56 +000013294void
13295_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013296{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013297 PyObject *keys;
13298 PyUnicodeObject *s;
13299 Py_ssize_t i, n;
13300 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013301
Benjamin Peterson14339b62009-01-31 16:36:08 +000013302 if (interned == NULL || !PyDict_Check(interned))
13303 return;
13304 keys = PyDict_Keys(interned);
13305 if (keys == NULL || !PyList_Check(keys)) {
13306 PyErr_Clear();
13307 return;
13308 }
Walter Dörwald16807132007-05-25 13:52:07 +000013309
Benjamin Peterson14339b62009-01-31 16:36:08 +000013310 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13311 detector, interned unicode strings are not forcibly deallocated;
13312 rather, we give them their stolen references back, and then clear
13313 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013314
Benjamin Peterson14339b62009-01-31 16:36:08 +000013315 n = PyList_GET_SIZE(keys);
13316 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013317 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013318 for (i = 0; i < n; i++) {
13319 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013320 if (PyUnicode_READY(s) == -1) {
13321 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013322 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013323 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013324 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013325 case SSTATE_NOT_INTERNED:
13326 /* XXX Shouldn't happen */
13327 break;
13328 case SSTATE_INTERNED_IMMORTAL:
13329 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013330 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013331 break;
13332 case SSTATE_INTERNED_MORTAL:
13333 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013334 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013335 break;
13336 default:
13337 Py_FatalError("Inconsistent interned string state.");
13338 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013339 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013340 }
13341 fprintf(stderr, "total size of all interned strings: "
13342 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13343 "mortal/immortal\n", mortal_size, immortal_size);
13344 Py_DECREF(keys);
13345 PyDict_Clear(interned);
13346 Py_DECREF(interned);
13347 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013348}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013349
13350
13351/********************* Unicode Iterator **************************/
13352
13353typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013354 PyObject_HEAD
13355 Py_ssize_t it_index;
13356 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013357} unicodeiterobject;
13358
13359static void
13360unicodeiter_dealloc(unicodeiterobject *it)
13361{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013362 _PyObject_GC_UNTRACK(it);
13363 Py_XDECREF(it->it_seq);
13364 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013365}
13366
13367static int
13368unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13369{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013370 Py_VISIT(it->it_seq);
13371 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013372}
13373
13374static PyObject *
13375unicodeiter_next(unicodeiterobject *it)
13376{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013377 PyUnicodeObject *seq;
13378 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013379
Benjamin Peterson14339b62009-01-31 16:36:08 +000013380 assert(it != NULL);
13381 seq = it->it_seq;
13382 if (seq == NULL)
13383 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013384 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013385
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013386 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13387 int kind = PyUnicode_KIND(seq);
13388 void *data = PyUnicode_DATA(seq);
13389 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13390 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013391 if (item != NULL)
13392 ++it->it_index;
13393 return item;
13394 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013395
Benjamin Peterson14339b62009-01-31 16:36:08 +000013396 Py_DECREF(seq);
13397 it->it_seq = NULL;
13398 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013399}
13400
13401static PyObject *
13402unicodeiter_len(unicodeiterobject *it)
13403{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013404 Py_ssize_t len = 0;
13405 if (it->it_seq)
13406 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
13407 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013408}
13409
13410PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
13411
13412static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013413 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000013414 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000013415 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013416};
13417
13418PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013419 PyVarObject_HEAD_INIT(&PyType_Type, 0)
13420 "str_iterator", /* tp_name */
13421 sizeof(unicodeiterobject), /* tp_basicsize */
13422 0, /* tp_itemsize */
13423 /* methods */
13424 (destructor)unicodeiter_dealloc, /* tp_dealloc */
13425 0, /* tp_print */
13426 0, /* tp_getattr */
13427 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013428 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013429 0, /* tp_repr */
13430 0, /* tp_as_number */
13431 0, /* tp_as_sequence */
13432 0, /* tp_as_mapping */
13433 0, /* tp_hash */
13434 0, /* tp_call */
13435 0, /* tp_str */
13436 PyObject_GenericGetAttr, /* tp_getattro */
13437 0, /* tp_setattro */
13438 0, /* tp_as_buffer */
13439 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
13440 0, /* tp_doc */
13441 (traverseproc)unicodeiter_traverse, /* tp_traverse */
13442 0, /* tp_clear */
13443 0, /* tp_richcompare */
13444 0, /* tp_weaklistoffset */
13445 PyObject_SelfIter, /* tp_iter */
13446 (iternextfunc)unicodeiter_next, /* tp_iternext */
13447 unicodeiter_methods, /* tp_methods */
13448 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013449};
13450
13451static PyObject *
13452unicode_iter(PyObject *seq)
13453{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013454 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013455
Benjamin Peterson14339b62009-01-31 16:36:08 +000013456 if (!PyUnicode_Check(seq)) {
13457 PyErr_BadInternalCall();
13458 return NULL;
13459 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013460 if (PyUnicode_READY(seq) == -1)
13461 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013462 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13463 if (it == NULL)
13464 return NULL;
13465 it->it_index = 0;
13466 Py_INCREF(seq);
13467 it->it_seq = (PyUnicodeObject *)seq;
13468 _PyObject_GC_TRACK(it);
13469 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013470}
13471
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013472#define UNIOP(x) Py_UNICODE_##x
13473#define UNIOP_t Py_UNICODE
13474#include "uniops.h"
13475#undef UNIOP
13476#undef UNIOP_t
13477#define UNIOP(x) Py_UCS4_##x
13478#define UNIOP_t Py_UCS4
13479#include "uniops.h"
13480#undef UNIOP
13481#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000013482
Victor Stinner71133ff2010-09-01 23:43:53 +000013483Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000013484PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000013485{
13486 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
13487 Py_UNICODE *copy;
13488 Py_ssize_t size;
13489
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013490 if (!PyUnicode_Check(unicode)) {
13491 PyErr_BadArgument();
13492 return NULL;
13493 }
Victor Stinner71133ff2010-09-01 23:43:53 +000013494 /* Ensure we won't overflow the size. */
13495 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13496 PyErr_NoMemory();
13497 return NULL;
13498 }
13499 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13500 size *= sizeof(Py_UNICODE);
13501 copy = PyMem_Malloc(size);
13502 if (copy == NULL) {
13503 PyErr_NoMemory();
13504 return NULL;
13505 }
13506 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
13507 return copy;
13508}
Martin v. Löwis5b222132007-06-10 09:51:05 +000013509
Georg Brandl66c221e2010-10-14 07:04:07 +000013510/* A _string module, to export formatter_parser and formatter_field_name_split
13511 to the string.Formatter class implemented in Python. */
13512
13513static PyMethodDef _string_methods[] = {
13514 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
13515 METH_O, PyDoc_STR("split the argument as a field name")},
13516 {"formatter_parser", (PyCFunction) formatter_parser,
13517 METH_O, PyDoc_STR("parse the argument as a format string")},
13518 {NULL, NULL}
13519};
13520
13521static struct PyModuleDef _string_module = {
13522 PyModuleDef_HEAD_INIT,
13523 "_string",
13524 PyDoc_STR("string helper module"),
13525 0,
13526 _string_methods,
13527 NULL,
13528 NULL,
13529 NULL,
13530 NULL
13531};
13532
13533PyMODINIT_FUNC
13534PyInit__string(void)
13535{
13536 return PyModule_Create(&_string_module);
13537}
13538
13539
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013540#ifdef __cplusplus
13541}
13542#endif