blob: 234bd01b769b1dfc7dc6c8b073be8db169006328 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Victor Stinnerce5faf62011-10-05 00:42:43 +020049#ifdef Py_DEBUG
50# define DONT_MAKE_RESULT_READY
51#endif
52
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Limit for the Unicode object free list */
54
Christian Heimes2202f872008-02-06 14:31:34 +000055#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
57/* Limit for the Unicode object free list stay alive optimization.
58
59 The implementation will keep allocated Unicode memory intact for
60 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000061 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Christian Heimes2202f872008-02-06 14:31:34 +000063 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000065 malloc()-overhead) bytes of unused garbage.
66
67 Setting the limit to 0 effectively turns the feature off.
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069 Note: This is an experimental feature ! If you get core dumps when
70 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72*/
73
Guido van Rossumfd4b9572000-04-10 13:51:10 +000074#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Victor Stinner910337b2011-10-03 03:20:16 +020096#ifdef Py_DEBUG
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020097# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
Victor Stinner910337b2011-10-03 03:20:16 +020098#else
99# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
100#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200101
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200102#define _PyUnicode_UTF8(op) \
103 (((PyCompactUnicodeObject*)(op))->utf8)
104#define PyUnicode_UTF8(op) \
Victor Stinner910337b2011-10-03 03:20:16 +0200105 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200106 assert(PyUnicode_IS_READY(op)), \
107 PyUnicode_IS_COMPACT_ASCII(op) ? \
108 ((char*)((PyASCIIObject*)(op) + 1)) : \
109 _PyUnicode_UTF8(op))
Victor Stinnerbc8b81b2011-09-29 19:31:34 +0200110#define _PyUnicode_UTF8_LENGTH(op) \
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200111 (((PyCompactUnicodeObject*)(op))->utf8_length)
112#define PyUnicode_UTF8_LENGTH(op) \
Victor Stinner910337b2011-10-03 03:20:16 +0200113 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200114 assert(PyUnicode_IS_READY(op)), \
115 PyUnicode_IS_COMPACT_ASCII(op) ? \
116 ((PyASCIIObject*)(op))->length : \
117 _PyUnicode_UTF8_LENGTH(op))
Victor Stinnera5f91632011-10-04 01:07:11 +0200118#define _PyUnicode_WSTR(op) \
119 (((PyASCIIObject*)(op))->wstr)
120#define _PyUnicode_WSTR_LENGTH(op) \
121 (((PyCompactUnicodeObject*)(op))->wstr_length)
122#define _PyUnicode_LENGTH(op) \
123 (((PyASCIIObject *)(op))->length)
124#define _PyUnicode_STATE(op) \
125 (((PyASCIIObject *)(op))->state)
126#define _PyUnicode_HASH(op) \
127 (((PyASCIIObject *)(op))->hash)
Victor Stinner910337b2011-10-03 03:20:16 +0200128#define _PyUnicode_KIND(op) \
129 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200130 ((PyASCIIObject *)(op))->state.kind)
Victor Stinner910337b2011-10-03 03:20:16 +0200131#define _PyUnicode_GET_LENGTH(op) \
132 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200133 ((PyASCIIObject *)(op))->length)
Victor Stinnera5f91632011-10-04 01:07:11 +0200134#define _PyUnicode_DATA_ANY(op) \
135 (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200136
Victor Stinner910337b2011-10-03 03:20:16 +0200137#undef PyUnicode_READY
138#define PyUnicode_READY(op) \
139 (assert(_PyUnicode_CHECK(op)), \
140 (PyUnicode_IS_READY(op) ? \
Victor Stinnera5f91632011-10-04 01:07:11 +0200141 0 : \
142 _PyUnicode_Ready((PyObject *)(op))))
Victor Stinner910337b2011-10-03 03:20:16 +0200143
/* Ensure the Unicode object stored at *p_obj is ready.
   p_obj is a pointer to an object pointer (it is cast to PyObject **).
   Evaluates to 0 when the object is already ready; otherwise evaluates to
   the result of _PyUnicode_ReadyReplace(p_obj).
   The argument is parenthesized on every use so that an expression such as
   `array + i` is dereferenced as a whole. */
#define _PyUnicode_READY_REPLACE(p_obj)                     \
    (assert(_PyUnicode_CHECK(*(p_obj))),                    \
     (PyUnicode_IS_READY(*(p_obj)) ?                        \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
148
Victor Stinnerc379ead2011-10-03 12:52:27 +0200149#define _PyUnicode_SHARE_UTF8(op) \
150 (assert(_PyUnicode_CHECK(op)), \
151 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
152 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of `op` refers to the same buffer as its
   character data (PyUnicode_DATA).  Uses the macro parameter `op`
   throughout, so it does not depend on the name of the caller's variable. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
156
Victor Stinner829c0ad2011-10-03 01:08:02 +0200157/* true if the Unicode object has an allocated UTF-8 memory block
158 (not shared with other data) */
Victor Stinner910337b2011-10-03 03:20:16 +0200159#define _PyUnicode_HAS_UTF8_MEMORY(op) \
160 (assert(_PyUnicode_CHECK(op)), \
161 (!PyUnicode_IS_COMPACT_ASCII(op) \
162 && _PyUnicode_UTF8(op) \
Victor Stinner829c0ad2011-10-03 01:08:02 +0200163 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
164
Victor Stinner03490912011-10-03 23:45:12 +0200165/* true if the Unicode object has an allocated wstr memory block
166 (not shared with other data) */
167#define _PyUnicode_HAS_WSTR_MEMORY(op) \
168 (assert(_PyUnicode_CHECK(op)), \
169 (_PyUnicode_WSTR(op) && \
170 (!PyUnicode_IS_READY(op) || \
171 _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
172
Victor Stinner910337b2011-10-03 03:20:16 +0200173/* Generic helper macro to convert characters of different types.
174 from_type and to_type have to be valid type names, begin and end
175 are pointers to the source characters which should be of type
176 "from_type *". to is a pointer of type "to_type *" and points to the
177 buffer where the result characters are written to. */
178#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
179 do { \
180 const from_type *iter_; to_type *to_; \
181 for (iter_ = (begin), to_ = (to_type *)(to); \
182 iter_ < (end); \
183 ++iter_, ++to_) { \
184 *to_ = (to_type)*iter_; \
185 } \
186 } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200187
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200188/* The Unicode string has been modified: reset the hash */
189#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
190
Walter Dörwald16807132007-05-25 13:52:07 +0000191/* This dictionary holds all interned unicode strings. Note that references
192 to strings in this dictionary are *not* counted in the string's ob_refcnt.
193 When the interned string reaches a refcnt of 0 the string deallocation
194 function will delete the reference from this dictionary.
195
   Another way to look at this is to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000197 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000198*/
199static PyObject *interned;
200
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000201/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200202static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000203
204/* Single character Unicode strings in the Latin-1 range are being
205 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200206static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000207
Christian Heimes190d79e2008-01-30 11:58:22 +0000208/* Fast detection of the most frequent whitespace characters */
209const unsigned char _Py_ascii_whitespace[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +0000210 0, 0, 0, 0, 0, 0, 0, 0,
Florent Xicluna806d8cf2010-03-30 19:34:18 +0000211/* case 0x0009: * CHARACTER TABULATION */
Christian Heimes1a8501c2008-10-02 19:56:01 +0000212/* case 0x000A: * LINE FEED */
Florent Xicluna806d8cf2010-03-30 19:34:18 +0000213/* case 0x000B: * LINE TABULATION */
Christian Heimes1a8501c2008-10-02 19:56:01 +0000214/* case 0x000C: * FORM FEED */
215/* case 0x000D: * CARRIAGE RETURN */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000216 0, 1, 1, 1, 1, 1, 0, 0,
217 0, 0, 0, 0, 0, 0, 0, 0,
Christian Heimes1a8501c2008-10-02 19:56:01 +0000218/* case 0x001C: * FILE SEPARATOR */
219/* case 0x001D: * GROUP SEPARATOR */
220/* case 0x001E: * RECORD SEPARATOR */
221/* case 0x001F: * UNIT SEPARATOR */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000222 0, 0, 0, 0, 1, 1, 1, 1,
Christian Heimes1a8501c2008-10-02 19:56:01 +0000223/* case 0x0020: * SPACE */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000224 1, 0, 0, 0, 0, 0, 0, 0,
225 0, 0, 0, 0, 0, 0, 0, 0,
226 0, 0, 0, 0, 0, 0, 0, 0,
227 0, 0, 0, 0, 0, 0, 0, 0,
Christian Heimes190d79e2008-01-30 11:58:22 +0000228
Benjamin Peterson14339b62009-01-31 16:36:08 +0000229 0, 0, 0, 0, 0, 0, 0, 0,
230 0, 0, 0, 0, 0, 0, 0, 0,
231 0, 0, 0, 0, 0, 0, 0, 0,
232 0, 0, 0, 0, 0, 0, 0, 0,
233 0, 0, 0, 0, 0, 0, 0, 0,
234 0, 0, 0, 0, 0, 0, 0, 0,
235 0, 0, 0, 0, 0, 0, 0, 0,
236 0, 0, 0, 0, 0, 0, 0, 0
Christian Heimes190d79e2008-01-30 11:58:22 +0000237};
238
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200239/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200240static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200241static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200242static void copy_characters(
243 PyObject *to, Py_ssize_t to_start,
244 PyObject *from, Py_ssize_t from_start,
245 Py_ssize_t how_many);
246static int unicode_is_singleton(PyObject *unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200247
Alexander Belopolsky40018472011-02-26 01:02:56 +0000248static PyObject *
249unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000250 PyObject **errorHandler,const char *encoding, const char *reason,
251 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
252 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
253
Alexander Belopolsky40018472011-02-26 01:02:56 +0000254static void
255raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300256 const char *encoding,
257 const Py_UNICODE *unicode, Py_ssize_t size,
258 Py_ssize_t startpos, Py_ssize_t endpos,
259 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000260
Christian Heimes190d79e2008-01-30 11:58:22 +0000261/* Same for linebreaks */
262static unsigned char ascii_linebreak[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +0000263 0, 0, 0, 0, 0, 0, 0, 0,
Christian Heimes1a8501c2008-10-02 19:56:01 +0000264/* 0x000A, * LINE FEED */
Florent Xicluna806d8cf2010-03-30 19:34:18 +0000265/* 0x000B, * LINE TABULATION */
266/* 0x000C, * FORM FEED */
Christian Heimes1a8501c2008-10-02 19:56:01 +0000267/* 0x000D, * CARRIAGE RETURN */
Florent Xicluna806d8cf2010-03-30 19:34:18 +0000268 0, 0, 1, 1, 1, 1, 0, 0,
Benjamin Peterson14339b62009-01-31 16:36:08 +0000269 0, 0, 0, 0, 0, 0, 0, 0,
Christian Heimes1a8501c2008-10-02 19:56:01 +0000270/* 0x001C, * FILE SEPARATOR */
271/* 0x001D, * GROUP SEPARATOR */
272/* 0x001E, * RECORD SEPARATOR */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000273 0, 0, 0, 0, 1, 1, 1, 0,
274 0, 0, 0, 0, 0, 0, 0, 0,
275 0, 0, 0, 0, 0, 0, 0, 0,
276 0, 0, 0, 0, 0, 0, 0, 0,
277 0, 0, 0, 0, 0, 0, 0, 0,
Christian Heimes190d79e2008-01-30 11:58:22 +0000278
Benjamin Peterson14339b62009-01-31 16:36:08 +0000279 0, 0, 0, 0, 0, 0, 0, 0,
280 0, 0, 0, 0, 0, 0, 0, 0,
281 0, 0, 0, 0, 0, 0, 0, 0,
282 0, 0, 0, 0, 0, 0, 0, 0,
283 0, 0, 0, 0, 0, 0, 0, 0,
284 0, 0, 0, 0, 0, 0, 0, 0,
285 0, 0, 0, 0, 0, 0, 0, 0,
286 0, 0, 0, 0, 0, 0, 0, 0
Christian Heimes190d79e2008-01-30 11:58:22 +0000287};
288
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300289/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
290 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000291Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000292PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000293{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000294#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000295 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000296#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000297 /* This is actually an illegal character, so it should
298 not be passed to unichr. */
299 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000300#endif
301}
302
#ifdef Py_DEBUG
/* Debug-only sanity check of the internal state of a Unicode object.

   op            -- the object to check (must satisfy PyUnicode_Check).
   check_content -- when non-zero, additionally scan the characters to verify
                    that the narrowest possible kind is used, and check that
                    the hash is still unset unless the object is a singleton.

   Every violated invariant trips an assert().  The function itself always
   returns 1 so that it can be used as assert(_PyUnicode_CheckConsistency(..)).
*/
int
/* FIXME: use PyObject* type for op */
_PyUnicode_CheckConsistency(void *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    /* Compact ASCII object: must be 1-byte kind and ready. */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* Compact, non-ASCII: the character data is taken to start
               right after the PyCompactUnicodeObject header. */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            /* the UTF-8 buffer must not alias the character data */
            assert (compact->utf8 != data);
        } else {
            /* Non-compact object: the data pointer lives in the
               PyUnicodeObject itself. */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* Not-ready string: only the wstr buffer exists; there is
                   no data buffer, no UTF-8 buffer, and it is not interned. */
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
            }
            else {
                /* Ready non-compact string: a data buffer must exist. */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII: the UTF-8 pointer aliases the data buffer and
                       has the same length */
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must alias the data buffer exactly when the kind has the
               same width as wchar_t (2 or 4 bytes, per SIZEOF_WCHAR_T). */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* A missing buffer must have a recorded length of zero. */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);
        /* find the largest character actually stored */
        for (i=0; i < ascii->length; i++)
        {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            /* the ascii flag must agree with the content */
            if (ascii->state.ascii == 0)
                assert(maxchar >= 128);
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND)
            assert(maxchar >= 0x100);
        else
            assert(maxchar >= 0x10000);
    }
    /* unless the object is a shared singleton, its hash must still be
       unset (-1) */
    if (check_content && !unicode_is_singleton((PyObject*)ascii))
        assert(ascii->hash == -1);
    return 1;
}
#endif
408
Thomas Wouters477c8d52006-05-27 19:21:47 +0000409/* --- Bloom Filters ----------------------------------------------------- */
410
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low-order
   bits of each Unicode character (ch & (BLOOM_WIDTH - 1)) as the bit index. */
414
415/* the linebreak mask is set up by Unicode_Init below */
416
Antoine Pitrouf068f942010-01-13 14:19:12 +0000417#if LONG_BIT >= 128
418#define BLOOM_WIDTH 128
419#elif LONG_BIT >= 64
420#define BLOOM_WIDTH 64
421#elif LONG_BIT >= 32
422#define BLOOM_WIDTH 32
423#else
424#error "LONG_BIT is smaller than 32"
425#endif
426
Thomas Wouters477c8d52006-05-27 19:21:47 +0000427#define BLOOM_MASK unsigned long
428
429static BLOOM_MASK bloom_linebreak;
430
/* BLOOM_ADD sets, and BLOOM tests, the bit of `mask` selected by the
   low-order bits of `ch` (ch & (BLOOM_WIDTH - 1)).  Both arguments are
   parenthesized so that compound expressions can be passed safely. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000433
Benjamin Peterson29060642009-01-31 22:14:21 +0000434#define BLOOM_LINEBREAK(ch) \
435 ((ch) < 128U ? ascii_linebreak[(ch)] : \
436 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000437
Alexander Belopolsky40018472011-02-26 01:02:56 +0000438Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200439make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000440{
441 /* calculate simple bloom-style bitmask for a given unicode string */
442
Antoine Pitrouf068f942010-01-13 14:19:12 +0000443 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000444 Py_ssize_t i;
445
446 mask = 0;
447 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200448 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000449
450 return mask;
451}
452
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200453#define BLOOM_MEMBER(mask, chr, str) \
454 (BLOOM(mask, chr) \
455 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000456
Guido van Rossumd57fd912000-03-10 22:53:23 +0000457/* --- Unicode Object ----------------------------------------------------- */
458
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200459static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200460fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200461
462Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
463 Py_ssize_t size, Py_UCS4 ch,
464 int direction)
465{
466 /* like wcschr, but doesn't stop at NULL characters */
467 Py_ssize_t i;
468 if (direction == 1) {
469 for(i = 0; i < size; i++)
470 if (PyUnicode_READ(kind, s, i) == ch)
471 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
472 }
473 else {
474 for(i = size-1; i >= 0; i--)
475 if (PyUnicode_READ(kind, s, i) == ch)
476 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
477 }
478 return NULL;
479}
480
Victor Stinnerfe226c02011-10-03 03:52:20 +0200481static PyObject*
482resize_compact(PyObject *unicode, Py_ssize_t length)
483{
484 Py_ssize_t char_size;
485 Py_ssize_t struct_size;
486 Py_ssize_t new_size;
487 int share_wstr;
488
489 assert(PyUnicode_IS_READY(unicode));
490 char_size = PyUnicode_CHARACTER_SIZE(unicode);
491 if (PyUnicode_IS_COMPACT_ASCII(unicode))
492 struct_size = sizeof(PyASCIIObject);
493 else
494 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200495 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200496
497 _Py_DEC_REFTOTAL;
498 _Py_ForgetReference(unicode);
499
500 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
501 PyErr_NoMemory();
502 return NULL;
503 }
504 new_size = (struct_size + (length + 1) * char_size);
505
506 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
507 if (unicode == NULL) {
508 PyObject_Del(unicode);
509 PyErr_NoMemory();
510 return NULL;
511 }
512 _Py_NewReference(unicode);
513 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200514 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200515 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200516 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
517 _PyUnicode_WSTR_LENGTH(unicode) = length;
518 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200519 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
520 length, 0);
521 return unicode;
522}
523
/* Resize a non-compact Unicode object in place to `length` characters.

   The object must not be compact and must have a reference count of 1
   (both are asserted).  The cached hash is reset.  For a ready string the
   data buffer is reallocated, and wstr/utf8 pointers that aliased it are
   re-pointed at the new buffer; a separately allocated wstr buffer, if any,
   is then reallocated as well.

   Returns 0 on success, -1 with MemoryError set on failure.
*/
static int
resize_inplace(PyUnicodeObject *unicode, Py_ssize_t length)
{
    wchar_t *wstr;
    assert(!PyUnicode_IS_COMPACT(unicode));
    assert(Py_REFCNT(unicode) == 1);

    /* the content is about to change: invalidate the cached hash */
    _PyUnicode_DIRTY(unicode);

    if (PyUnicode_IS_READY(unicode)) {
        Py_ssize_t char_size;
        Py_ssize_t new_size;
        int share_wstr, share_utf8;
        void *data;

        data = _PyUnicode_DATA_ANY(unicode);
        assert(data != NULL);
        char_size = PyUnicode_CHARACTER_SIZE(unicode);
        /* record the aliasing before the buffer moves */
        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
        /* a separately allocated UTF-8 buffer is dropped */
        if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
        {
            PyObject_DEL(_PyUnicode_UTF8(unicode));
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }

        /* check for integer overflow of (length + 1) * char_size */
        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
            PyErr_NoMemory();
            return -1;
        }
        new_size = (length + 1) * char_size;

        /* `data` is a local copy, so on failure the object still points
           at its old buffer */
        data = (PyObject *)PyObject_REALLOC(data, new_size);
        if (data == NULL) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_DATA_ANY(unicode) = data;
        if (share_wstr) {
            _PyUnicode_WSTR(unicode) = data;
            _PyUnicode_WSTR_LENGTH(unicode) = length;
        }
        if (share_utf8) {
            _PyUnicode_UTF8(unicode) = data;
            _PyUnicode_UTF8_LENGTH(unicode) = length;
        }
        _PyUnicode_LENGTH(unicode) = length;
        /* write the terminating null character */
        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
        /* done unless a separate wstr buffer still has to be resized */
        if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
            assert(_PyUnicode_CheckConsistency(unicode, 0));
            return 0;
        }
    }
    /* from here on: resize the separately allocated wstr buffer */
    assert(_PyUnicode_WSTR(unicode) != NULL);

    /* check for integer overflow */
    if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
        PyErr_NoMemory();
        return -1;
    }
    wstr =  _PyUnicode_WSTR(unicode);
    wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
    if (!wstr) {
        PyErr_NoMemory();
        return -1;
    }
    _PyUnicode_WSTR(unicode) = wstr;
    _PyUnicode_WSTR(unicode)[length] = 0;
    _PyUnicode_WSTR_LENGTH(unicode) = length;
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return 0;
}
597
Victor Stinnerfe226c02011-10-03 03:52:20 +0200598static PyObject*
599resize_copy(PyObject *unicode, Py_ssize_t length)
600{
601 Py_ssize_t copy_length;
602 if (PyUnicode_IS_COMPACT(unicode)) {
603 PyObject *copy;
604 assert(PyUnicode_IS_READY(unicode));
605
606 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
607 if (copy == NULL)
608 return NULL;
609
610 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200611 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200612 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200613 }
614 else {
Victor Stinner2fd82272011-10-03 04:06:05 +0200615 PyUnicodeObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200616 assert(_PyUnicode_WSTR(unicode) != NULL);
617 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner2fd82272011-10-03 04:06:05 +0200618 w = _PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200619 if (w == NULL)
620 return NULL;
621 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
622 copy_length = Py_MIN(copy_length, length);
623 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
624 copy_length);
625 return (PyObject*)w;
626 }
627}
628
/* We allocate one more character to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000632
633 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000634 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000635
636*/
637
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200638#ifdef Py_DEBUG
639int unicode_old_new_calls = 0;
640#endif
641
Alexander Belopolsky40018472011-02-26 01:02:56 +0000642static PyUnicodeObject *
643_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000644{
645 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200646 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000647
Thomas Wouters477c8d52006-05-27 19:21:47 +0000648 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000649 if (length == 0 && unicode_empty != NULL) {
650 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200651 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000652 }
653
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000654 /* Ensure we won't overflow the size. */
655 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
656 return (PyUnicodeObject *)PyErr_NoMemory();
657 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200658 if (length < 0) {
659 PyErr_SetString(PyExc_SystemError,
660 "Negative size passed to _PyUnicode_New");
661 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000662 }
663
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200664#ifdef Py_DEBUG
665 ++unicode_old_new_calls;
666#endif
667
668 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
669 if (unicode == NULL)
670 return NULL;
671 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
672 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
673 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000674 PyErr_NoMemory();
675 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000676 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200677
Jeremy Hyltond8082792003-09-16 19:41:39 +0000678 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000679 * the caller fails before initializing str -- unicode_resize()
680 * reads str[0], and the Keep-Alive optimization can keep memory
681 * allocated for str alive across a call to unicode_dealloc(unicode).
682 * We don't want unicode_resize to read uninitialized memory in
683 * that case.
684 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200685 _PyUnicode_WSTR(unicode)[0] = 0;
686 _PyUnicode_WSTR(unicode)[length] = 0;
687 _PyUnicode_WSTR_LENGTH(unicode) = length;
688 _PyUnicode_HASH(unicode) = -1;
689 _PyUnicode_STATE(unicode).interned = 0;
690 _PyUnicode_STATE(unicode).kind = 0;
691 _PyUnicode_STATE(unicode).compact = 0;
692 _PyUnicode_STATE(unicode).ready = 0;
693 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200694 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200695 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200696 _PyUnicode_UTF8(unicode) = NULL;
697 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000698 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000699
Benjamin Peterson29060642009-01-31 22:14:21 +0000700 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000701 /* XXX UNREF/NEWREF interface should be more symmetrical */
702 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000703 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000704 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000705 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000706}
707
Victor Stinnerf42dc442011-10-02 23:33:16 +0200708static const char*
709unicode_kind_name(PyObject *unicode)
710{
Victor Stinner42dfd712011-10-03 14:41:45 +0200711 /* don't check consistency: unicode_kind_name() is called from
712 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200713 if (!PyUnicode_IS_COMPACT(unicode))
714 {
715 if (!PyUnicode_IS_READY(unicode))
716 return "wstr";
717 switch(PyUnicode_KIND(unicode))
718 {
719 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200720 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200721 return "legacy ascii";
722 else
723 return "legacy latin1";
724 case PyUnicode_2BYTE_KIND:
725 return "legacy UCS2";
726 case PyUnicode_4BYTE_KIND:
727 return "legacy UCS4";
728 default:
729 return "<legacy invalid kind>";
730 }
731 }
732 assert(PyUnicode_IS_READY(unicode));
733 switch(PyUnicode_KIND(unicode))
734 {
735 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200736 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200737 return "ascii";
738 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200739 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200740 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200741 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200742 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200743 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200744 default:
745 return "<invalid compact kind>";
746 }
747}
748
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200749#ifdef Py_DEBUG
750int unicode_new_new_calls = 0;
751
752/* Functions wrapping macros for use in debugger */
/* Debugger helper: function wrapper around the PyUnicode_UTF8() macro. */
char *_PyUnicode_utf8(void *unicode){
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
756
/* Debugger helper: function wrapper around the _PyUnicode_COMPACT_DATA()
   macro. */
void *_PyUnicode_compact_data(void *unicode) {
    void *payload = _PyUnicode_COMPACT_DATA(unicode);
    return payload;
}
760void *_PyUnicode_data(void *unicode){
761 printf("obj %p\n", unicode);
762 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
763 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
764 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
765 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
766 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
767 return PyUnicode_DATA(unicode);
768}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200769
770void
771_PyUnicode_Dump(PyObject *op)
772{
773 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200774 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
775 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
776 void *data;
777 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
778 if (ascii->state.compact)
779 data = (compact + 1);
780 else
781 data = unicode->data.any;
782 if (ascii->wstr == data)
783 printf("shared ");
784 printf("wstr=%p", ascii->wstr);
Victor Stinnera3b334d2011-10-03 13:53:37 +0200785 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200786 printf(" (%zu), ", compact->wstr_length);
787 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
788 printf("shared ");
789 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200790 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200791 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200792}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200793#endif
794
795PyObject *
796PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
797{
798 PyObject *obj;
799 PyCompactUnicodeObject *unicode;
800 void *data;
801 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200802 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200803 Py_ssize_t char_size;
804 Py_ssize_t struct_size;
805
806 /* Optimization for empty strings */
807 if (size == 0 && unicode_empty != NULL) {
808 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200809 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200810 }
811
812#ifdef Py_DEBUG
813 ++unicode_new_new_calls;
814#endif
815
Victor Stinner9e9d6892011-10-04 01:02:02 +0200816 is_ascii = 0;
817 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200818 struct_size = sizeof(PyCompactUnicodeObject);
819 if (maxchar < 128) {
820 kind_state = PyUnicode_1BYTE_KIND;
821 char_size = 1;
822 is_ascii = 1;
823 struct_size = sizeof(PyASCIIObject);
824 }
825 else if (maxchar < 256) {
826 kind_state = PyUnicode_1BYTE_KIND;
827 char_size = 1;
828 }
829 else if (maxchar < 65536) {
830 kind_state = PyUnicode_2BYTE_KIND;
831 char_size = 2;
832 if (sizeof(wchar_t) == 2)
833 is_sharing = 1;
834 }
835 else {
836 kind_state = PyUnicode_4BYTE_KIND;
837 char_size = 4;
838 if (sizeof(wchar_t) == 4)
839 is_sharing = 1;
840 }
841
842 /* Ensure we won't overflow the size. */
843 if (size < 0) {
844 PyErr_SetString(PyExc_SystemError,
845 "Negative size passed to PyUnicode_New");
846 return NULL;
847 }
848 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
849 return PyErr_NoMemory();
850
851 /* Duplicated allocation code from _PyObject_New() instead of a call to
852 * PyObject_New() so we are able to allocate space for the object and
853 * it's data buffer.
854 */
855 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
856 if (obj == NULL)
857 return PyErr_NoMemory();
858 obj = PyObject_INIT(obj, &PyUnicode_Type);
859 if (obj == NULL)
860 return NULL;
861
862 unicode = (PyCompactUnicodeObject *)obj;
863 if (is_ascii)
864 data = ((PyASCIIObject*)obj) + 1;
865 else
866 data = unicode + 1;
867 _PyUnicode_LENGTH(unicode) = size;
868 _PyUnicode_HASH(unicode) = -1;
869 _PyUnicode_STATE(unicode).interned = 0;
870 _PyUnicode_STATE(unicode).kind = kind_state;
871 _PyUnicode_STATE(unicode).compact = 1;
872 _PyUnicode_STATE(unicode).ready = 1;
873 _PyUnicode_STATE(unicode).ascii = is_ascii;
874 if (is_ascii) {
875 ((char*)data)[size] = 0;
876 _PyUnicode_WSTR(unicode) = NULL;
877 }
878 else if (kind_state == PyUnicode_1BYTE_KIND) {
879 ((char*)data)[size] = 0;
880 _PyUnicode_WSTR(unicode) = NULL;
881 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200882 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200883 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200884 }
885 else {
886 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200887 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200888 if (kind_state == PyUnicode_2BYTE_KIND)
889 ((Py_UCS2*)data)[size] = 0;
890 else /* kind_state == PyUnicode_4BYTE_KIND */
891 ((Py_UCS4*)data)[size] = 0;
892 if (is_sharing) {
893 _PyUnicode_WSTR_LENGTH(unicode) = size;
894 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
895 }
896 else {
897 _PyUnicode_WSTR_LENGTH(unicode) = 0;
898 _PyUnicode_WSTR(unicode) = NULL;
899 }
900 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200901 assert(_PyUnicode_CheckConsistency(unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200902 return obj;
903}
904
905#if SIZEOF_WCHAR_T == 2
906/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
907 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +0200908 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200909
910 This function assumes that unicode can hold one more code point than wstr
911 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +0200912static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200913unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
914 PyUnicodeObject *unicode)
915{
916 const wchar_t *iter;
917 Py_UCS4 *ucs4_out;
918
Victor Stinner910337b2011-10-03 03:20:16 +0200919 assert(unicode != NULL);
920 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200921 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
922 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
923
924 for (iter = begin; iter < end; ) {
925 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
926 _PyUnicode_GET_LENGTH(unicode)));
927 if (*iter >= 0xD800 && *iter <= 0xDBFF
928 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
929 {
930 *ucs4_out++ = (((iter[0] & 0x3FF)<<10) | (iter[1] & 0x3FF)) + 0x10000;
931 iter += 2;
932 }
933 else {
934 *ucs4_out++ = *iter;
935 iter++;
936 }
937 }
938 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
939 _PyUnicode_GET_LENGTH(unicode)));
940
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200941}
942#endif
943
Victor Stinnercd9950f2011-10-02 00:34:53 +0200944static int
945_PyUnicode_Dirty(PyObject *unicode)
946{
Victor Stinner910337b2011-10-03 03:20:16 +0200947 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +0200948 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +0200949 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +0200950 "Cannot modify a string having more than 1 reference");
951 return -1;
952 }
953 _PyUnicode_DIRTY(unicode);
954 return 0;
955}
956
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200957static int
958_copy_characters(PyObject *to, Py_ssize_t to_start,
959 PyObject *from, Py_ssize_t from_start,
960 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200961{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200962 unsigned int from_kind, to_kind;
963 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200964 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200965
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200966 assert(PyUnicode_Check(from));
967 assert(PyUnicode_Check(to));
968 assert(PyUnicode_IS_READY(from));
969 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200970
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200971 assert(PyUnicode_GET_LENGTH(from) >= how_many);
972 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
973 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200974
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200975 if (how_many == 0)
976 return 0;
977
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200978 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200979 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200980 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200981 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200982
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200983#ifdef Py_DEBUG
984 if (!check_maxchar
985 && (from_kind > to_kind
986 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200987 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200988 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
989 Py_UCS4 ch;
990 Py_ssize_t i;
991 for (i=0; i < how_many; i++) {
992 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
993 assert(ch <= to_maxchar);
994 }
995 }
996#endif
997 fast = (from_kind == to_kind);
998 if (check_maxchar
999 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1000 {
1001 /* deny latin1 => ascii */
1002 fast = 0;
1003 }
1004
1005 if (fast) {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001006 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001007 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +02001008 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001009 + PyUnicode_KIND_SIZE(from_kind, from_start),
1010 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001011 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001012 else if (from_kind == PyUnicode_1BYTE_KIND
1013 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001014 {
1015 _PyUnicode_CONVERT_BYTES(
1016 Py_UCS1, Py_UCS2,
1017 PyUnicode_1BYTE_DATA(from) + from_start,
1018 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1019 PyUnicode_2BYTE_DATA(to) + to_start
1020 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001021 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001022 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001023 && to_kind == PyUnicode_4BYTE_KIND)
1024 {
1025 _PyUnicode_CONVERT_BYTES(
1026 Py_UCS1, Py_UCS4,
1027 PyUnicode_1BYTE_DATA(from) + from_start,
1028 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1029 PyUnicode_4BYTE_DATA(to) + to_start
1030 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001031 }
1032 else if (from_kind == PyUnicode_2BYTE_KIND
1033 && to_kind == PyUnicode_4BYTE_KIND)
1034 {
1035 _PyUnicode_CONVERT_BYTES(
1036 Py_UCS2, Py_UCS4,
1037 PyUnicode_2BYTE_DATA(from) + from_start,
1038 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1039 PyUnicode_4BYTE_DATA(to) + to_start
1040 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001041 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001042 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001043 /* check if max_char(from substring) <= max_char(to) */
1044 if (from_kind > to_kind
1045 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001046 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001047 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001048 /* slow path to check for character overflow */
1049 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001050 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001051 Py_ssize_t i;
1052
Victor Stinnera0702ab2011-09-29 14:14:38 +02001053 for (i=0; i < how_many; i++) {
1054 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001055 if (check_maxchar) {
1056 if (ch > to_maxchar)
1057 return 1;
1058 }
1059 else {
1060 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001061 }
1062 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1063 }
1064 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001065 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001066 return -1;
1067 }
1068 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001069 return 0;
1070}
1071
1072static void
1073copy_characters(PyObject *to, Py_ssize_t to_start,
1074 PyObject *from, Py_ssize_t from_start,
1075 Py_ssize_t how_many)
1076{
1077 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1078}
1079
1080Py_ssize_t
1081PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1082 PyObject *from, Py_ssize_t from_start,
1083 Py_ssize_t how_many)
1084{
1085 int err;
1086
1087 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1088 PyErr_BadInternalCall();
1089 return -1;
1090 }
1091
1092 if (PyUnicode_READY(from))
1093 return -1;
1094 if (PyUnicode_READY(to))
1095 return -1;
1096
1097 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1098 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1099 PyErr_Format(PyExc_SystemError,
1100 "Cannot write %zi characters at %zi "
1101 "in a string of %zi characters",
1102 how_many, to_start, PyUnicode_GET_LENGTH(to));
1103 return -1;
1104 }
1105
1106 if (how_many == 0)
1107 return 0;
1108
1109 if (_PyUnicode_Dirty(to))
1110 return -1;
1111
1112 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1113 if (err) {
1114 PyErr_Format(PyExc_SystemError,
1115 "Cannot copy %s characters "
1116 "into a string of %s characters",
1117 unicode_kind_name(from),
1118 unicode_kind_name(to));
1119 return -1;
1120 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001121 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001122}
1123
Victor Stinner17222162011-09-28 22:15:37 +02001124/* Find the maximum code point and count the number of surrogate pairs so a
1125 correct string length can be computed before converting a string to UCS4.
1126 This function counts single surrogates as a character and not as a pair.
1127
1128 Return 0 on success, or -1 on error. */
1129static int
1130find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1131 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001132{
1133 const wchar_t *iter;
1134
Victor Stinnerc53be962011-10-02 21:33:54 +02001135 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001136 *num_surrogates = 0;
1137 *maxchar = 0;
1138
1139 for (iter = begin; iter < end; ) {
Victor Stinnerae864852011-10-05 14:02:44 +02001140 if (*iter > *maxchar) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001141 *maxchar = *iter;
Victor Stinnerae864852011-10-05 14:02:44 +02001142#if SIZEOF_WCHAR_T != 2
1143 if (*maxchar >= 0x10000)
1144 return 0;
1145#endif
1146 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001147#if SIZEOF_WCHAR_T == 2
1148 if (*iter >= 0xD800 && *iter <= 0xDBFF
1149 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1150 {
1151 Py_UCS4 surrogate_val;
1152 surrogate_val = (((iter[0] & 0x3FF)<<10)
1153 | (iter[1] & 0x3FF)) + 0x10000;
1154 ++(*num_surrogates);
1155 if (surrogate_val > *maxchar)
1156 *maxchar = surrogate_val;
1157 iter += 2;
1158 }
1159 else
1160 iter++;
1161#else
1162 iter++;
1163#endif
1164 }
1165 return 0;
1166}
1167
1168#ifdef Py_DEBUG
1169int unicode_ready_calls = 0;
1170#endif
1171
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001172static int
1173unicode_ready(PyObject **p_obj, int replace)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001174{
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001175 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001176 wchar_t *end;
1177 Py_UCS4 maxchar = 0;
1178 Py_ssize_t num_surrogates;
1179#if SIZEOF_WCHAR_T == 2
1180 Py_ssize_t length_wo_surrogates;
1181#endif
1182
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001183 assert(p_obj != NULL);
1184 unicode = (PyUnicodeObject *)*p_obj;
1185
Georg Brandl7597add2011-10-05 16:36:47 +02001186 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001187 strings were created using _PyObject_New() and where no canonical
1188 representation (the str field) has been set yet aka strings
1189 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001190 assert(_PyUnicode_CHECK(unicode));
1191 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001192 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001193 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001194 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001195 /* Actually, it should neither be interned nor be anything else: */
1196 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001197
1198#ifdef Py_DEBUG
1199 ++unicode_ready_calls;
1200#endif
1201
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001202#ifdef Py_DEBUG
1203 assert(!replace || Py_REFCNT(unicode) == 1);
1204#else
1205 if (replace && Py_REFCNT(unicode) != 1)
1206 replace = 0;
1207#endif
1208 if (replace) {
1209 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
1210 wchar_t *wstr = _PyUnicode_WSTR(unicode);
1211 /* Optimization for empty strings */
1212 if (len == 0) {
1213 Py_INCREF(unicode_empty);
1214 Py_DECREF(*p_obj);
1215 *p_obj = unicode_empty;
1216 return 0;
1217 }
1218 if (len == 1 && wstr[0] < 256) {
1219 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
1220 if (latin1_char == NULL)
1221 return -1;
1222 Py_DECREF(*p_obj);
1223 *p_obj = latin1_char;
1224 return 0;
1225 }
1226 }
1227
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001228 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001229 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001230 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001231 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001232
1233 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001234 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1235 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001236 PyErr_NoMemory();
1237 return -1;
1238 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001239 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001240 _PyUnicode_WSTR(unicode), end,
1241 PyUnicode_1BYTE_DATA(unicode));
1242 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1243 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1244 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1245 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001246 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001247 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001248 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001249 }
1250 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001251 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001252 _PyUnicode_UTF8(unicode) = NULL;
1253 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001254 }
1255 PyObject_FREE(_PyUnicode_WSTR(unicode));
1256 _PyUnicode_WSTR(unicode) = NULL;
1257 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1258 }
1259 /* In this case we might have to convert down from 4-byte native
1260 wchar_t to 2-byte unicode. */
1261 else if (maxchar < 65536) {
1262 assert(num_surrogates == 0 &&
1263 "FindMaxCharAndNumSurrogatePairs() messed up");
1264
Victor Stinner506f5922011-09-28 22:34:18 +02001265#if SIZEOF_WCHAR_T == 2
1266 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001267 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001268 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1269 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1270 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001271 _PyUnicode_UTF8(unicode) = NULL;
1272 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001273#else
1274 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001275 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001276 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001277 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001278 PyErr_NoMemory();
1279 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001280 }
Victor Stinner506f5922011-09-28 22:34:18 +02001281 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1282 _PyUnicode_WSTR(unicode), end,
1283 PyUnicode_2BYTE_DATA(unicode));
1284 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1285 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1286 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001287 _PyUnicode_UTF8(unicode) = NULL;
1288 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001289 PyObject_FREE(_PyUnicode_WSTR(unicode));
1290 _PyUnicode_WSTR(unicode) = NULL;
1291 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1292#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001293 }
1294 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1295 else {
1296#if SIZEOF_WCHAR_T == 2
1297 /* in case the native representation is 2-bytes, we need to allocate a
1298 new normalized 4-byte version. */
1299 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001300 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1301 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001302 PyErr_NoMemory();
1303 return -1;
1304 }
1305 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1306 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001307 _PyUnicode_UTF8(unicode) = NULL;
1308 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001309 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1310 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001311 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001312 PyObject_FREE(_PyUnicode_WSTR(unicode));
1313 _PyUnicode_WSTR(unicode) = NULL;
1314 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1315#else
1316 assert(num_surrogates == 0);
1317
Victor Stinnerc3c74152011-10-02 20:39:55 +02001318 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001319 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001320 _PyUnicode_UTF8(unicode) = NULL;
1321 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001322 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1323#endif
1324 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1325 }
1326 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001327 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001328 return 0;
1329}
1330
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001331int
1332_PyUnicode_ReadyReplace(PyObject **op)
1333{
1334 return unicode_ready(op, 1);
1335}
1336
1337int
1338_PyUnicode_Ready(PyObject *op)
1339{
1340 return unicode_ready(&op, 0);
1341}
1342
Alexander Belopolsky40018472011-02-26 01:02:56 +00001343static void
1344unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001345{
Walter Dörwald16807132007-05-25 13:52:07 +00001346 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001347 case SSTATE_NOT_INTERNED:
1348 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001349
Benjamin Peterson29060642009-01-31 22:14:21 +00001350 case SSTATE_INTERNED_MORTAL:
1351 /* revive dead object temporarily for DelItem */
1352 Py_REFCNT(unicode) = 3;
1353 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1354 Py_FatalError(
1355 "deletion of interned string failed");
1356 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001357
Benjamin Peterson29060642009-01-31 22:14:21 +00001358 case SSTATE_INTERNED_IMMORTAL:
1359 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001360
Benjamin Peterson29060642009-01-31 22:14:21 +00001361 default:
1362 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001363 }
1364
Victor Stinner03490912011-10-03 23:45:12 +02001365 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001366 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001367 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001368 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001369
1370 if (PyUnicode_IS_COMPACT(unicode)) {
1371 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001372 }
1373 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001374 if (_PyUnicode_DATA_ANY(unicode))
1375 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001376 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001377 }
1378}
1379
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001380#ifdef Py_DEBUG
1381static int
1382unicode_is_singleton(PyObject *unicode)
1383{
1384 PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1385 if (unicode == unicode_empty)
1386 return 1;
1387 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1388 {
1389 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1390 if (ch < 256 && unicode_latin1[ch] == unicode)
1391 return 1;
1392 }
1393 return 0;
1394}
1395#endif
1396
Alexander Belopolsky40018472011-02-26 01:02:56 +00001397static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001398unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001399{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001400 if (Py_REFCNT(unicode) != 1)
1401 return 0;
1402 if (PyUnicode_CHECK_INTERNED(unicode))
1403 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001404#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001405 /* singleton refcount is greater than 1 */
1406 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001407#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001408 return 1;
1409}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001410
Victor Stinnerfe226c02011-10-03 03:52:20 +02001411static int
1412unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1413{
1414 PyObject *unicode;
1415 Py_ssize_t old_length;
1416
1417 assert(p_unicode != NULL);
1418 unicode = *p_unicode;
1419
1420 assert(unicode != NULL);
1421 assert(PyUnicode_Check(unicode));
1422 assert(0 <= length);
1423
Victor Stinner910337b2011-10-03 03:20:16 +02001424 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001425 old_length = PyUnicode_WSTR_LENGTH(unicode);
1426 else
1427 old_length = PyUnicode_GET_LENGTH(unicode);
1428 if (old_length == length)
1429 return 0;
1430
Victor Stinnerfe226c02011-10-03 03:52:20 +02001431 if (!unicode_resizable(unicode)) {
1432 PyObject *copy = resize_copy(unicode, length);
1433 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001434 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001435 Py_DECREF(*p_unicode);
1436 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001437 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001438 }
1439
Victor Stinnerfe226c02011-10-03 03:52:20 +02001440 if (PyUnicode_IS_COMPACT(unicode)) {
1441 *p_unicode = resize_compact(unicode, length);
1442 if (*p_unicode == NULL)
1443 return -1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001444 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001445 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001446 }
1447 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001448}
1449
Alexander Belopolsky40018472011-02-26 01:02:56 +00001450int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001451PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001452{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001453 PyObject *unicode;
1454 if (p_unicode == NULL) {
1455 PyErr_BadInternalCall();
1456 return -1;
1457 }
1458 unicode = *p_unicode;
1459 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1460 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1461 {
1462 PyErr_BadInternalCall();
1463 return -1;
1464 }
1465 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001466}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001467
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001468static PyObject*
1469get_latin1_char(unsigned char ch)
1470{
Victor Stinnera464fc12011-10-02 20:39:30 +02001471 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001472 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001473 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001474 if (!unicode)
1475 return NULL;
1476 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001477 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001478 unicode_latin1[ch] = unicode;
1479 }
1480 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001481 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001482}
1483
Alexander Belopolsky40018472011-02-26 01:02:56 +00001484PyObject *
1485PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001486{
1487 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001488 Py_UCS4 maxchar = 0;
1489 Py_ssize_t num_surrogates;
1490
1491 if (u == NULL)
1492 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001493
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001494 /* If the Unicode data is known at construction time, we can apply
1495 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001496
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001497 /* Optimization for empty strings */
1498 if (size == 0 && unicode_empty != NULL) {
1499 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001500 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001501 }
Tim Petersced69f82003-09-16 20:30:58 +00001502
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001503 /* Single character Unicode objects in the Latin-1 range are
1504 shared when using this constructor */
1505 if (size == 1 && *u < 256)
1506 return get_latin1_char((unsigned char)*u);
1507
1508 /* If not empty and not single character, copy the Unicode data
1509 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001510 if (find_maxchar_surrogates(u, u + size,
1511 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001512 return NULL;
1513
1514 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1515 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001516 if (!unicode)
1517 return NULL;
1518
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001519 switch (PyUnicode_KIND(unicode)) {
1520 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001521 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001522 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1523 break;
1524 case PyUnicode_2BYTE_KIND:
1525#if Py_UNICODE_SIZE == 2
1526 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1527#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001528 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001529 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1530#endif
1531 break;
1532 case PyUnicode_4BYTE_KIND:
1533#if SIZEOF_WCHAR_T == 2
1534 /* This is the only case which has to process surrogates, thus
1535 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001536 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001537#else
1538 assert(num_surrogates == 0);
1539 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1540#endif
1541 break;
1542 default:
1543 assert(0 && "Impossible state");
1544 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001545
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001546 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001547 return (PyObject *)unicode;
1548}
1549
Alexander Belopolsky40018472011-02-26 01:02:56 +00001550PyObject *
1551PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001552{
1553 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001554
Benjamin Peterson14339b62009-01-31 16:36:08 +00001555 if (size < 0) {
1556 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001557 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001558 return NULL;
1559 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001560
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001561 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001562 some optimizations which share commonly used objects.
1563 Also, this means the input must be UTF-8, so fall back to the
1564 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001565 if (u != NULL) {
1566
Benjamin Peterson29060642009-01-31 22:14:21 +00001567 /* Optimization for empty strings */
1568 if (size == 0 && unicode_empty != NULL) {
1569 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001570 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001571 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001572
1573 /* Single characters are shared when using this constructor.
1574 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001575 if (size == 1 && Py_CHARMASK(*u) < 128)
1576 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001577
1578 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001579 }
1580
Walter Dörwald55507312007-05-18 13:12:10 +00001581 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001582 if (!unicode)
1583 return NULL;
1584
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001585 return (PyObject *)unicode;
1586}
1587
Alexander Belopolsky40018472011-02-26 01:02:56 +00001588PyObject *
1589PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001590{
1591 size_t size = strlen(u);
1592 if (size > PY_SSIZE_T_MAX) {
1593 PyErr_SetString(PyExc_OverflowError, "input too long");
1594 return NULL;
1595 }
1596
1597 return PyUnicode_FromStringAndSize(u, size);
1598}
1599
Victor Stinnere57b1c02011-09-28 22:20:48 +02001600static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001601unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001602{
Victor Stinner0617b6e2011-10-05 23:26:01 +02001603 PyObject *res;
1604#ifdef Py_DEBUG
1605 const unsigned char *p;
1606 const unsigned char *end = s + size;
1607 for (p=s; p < end; p++) {
1608 assert(*p < 128);
1609 }
1610#endif
1611 res = PyUnicode_New(size, 127);
Victor Stinner702c7342011-10-05 13:50:52 +02001612 if (!res)
1613 return NULL;
Victor Stinner0617b6e2011-10-05 23:26:01 +02001614 memcpy(PyUnicode_1BYTE_DATA(res), s, size);
Victor Stinner702c7342011-10-05 13:50:52 +02001615 return res;
1616}
1617
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001618static Py_UCS4
1619kind_maxchar_limit(unsigned int kind)
1620{
1621 switch(kind) {
1622 case PyUnicode_1BYTE_KIND:
1623 return 0x80;
1624 case PyUnicode_2BYTE_KIND:
1625 return 0x100;
1626 case PyUnicode_4BYTE_KIND:
1627 return 0x10000;
1628 default:
1629 assert(0 && "invalid kind");
1630 return 0x10ffff;
1631 }
1632}
1633
Victor Stinner702c7342011-10-05 13:50:52 +02001634static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001635_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001636{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001637 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001638 unsigned char max_char = 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001639 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001640
1641 assert(size >= 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001642 for (i = 0; i < size; i++) {
1643 if (u[i] & 0x80) {
Victor Stinnerb9275c12011-10-05 14:01:42 +02001644 max_char = 255;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001645 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001646 }
1647 }
Victor Stinnerb9275c12011-10-05 14:01:42 +02001648 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001649 if (!res)
1650 return NULL;
1651 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001652 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001653 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001654}
1655
Victor Stinnere57b1c02011-09-28 22:20:48 +02001656static PyObject*
1657_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001658{
1659 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001660 Py_UCS2 max_char = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001661 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001662
1663 assert(size >= 0);
1664 for (i = 0; i < size; i++) {
1665 if (u[i] > max_char) {
1666 max_char = u[i];
1667 if (max_char >= 256)
1668 break;
1669 }
1670 }
1671 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001672 if (!res)
1673 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001674 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001675 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1676 else
1677 for (i = 0; i < size; i++)
1678 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001679 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001680 return res;
1681}
1682
Victor Stinnere57b1c02011-09-28 22:20:48 +02001683static PyObject*
1684_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001685{
1686 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001687 Py_UCS4 max_char = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001688 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001689
1690 assert(size >= 0);
1691 for (i = 0; i < size; i++) {
1692 if (u[i] > max_char) {
1693 max_char = u[i];
1694 if (max_char >= 0x10000)
1695 break;
1696 }
1697 }
1698 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001699 if (!res)
1700 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001701 if (max_char >= 0x10000)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001702 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1703 else {
1704 int kind = PyUnicode_KIND(res);
1705 void *data = PyUnicode_DATA(res);
1706 for (i = 0; i < size; i++)
1707 PyUnicode_WRITE(kind, data, i, u[i]);
1708 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001709 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001710 return res;
1711}
1712
1713PyObject*
1714PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1715{
1716 switch(kind) {
1717 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001718 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001719 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001720 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001721 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001722 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001723 default:
1724 assert(0 && "invalid kind");
1725 PyErr_SetString(PyExc_SystemError, "invalid kind");
1726 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001727 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001728}
1729
Victor Stinner034f6cf2011-09-30 02:26:44 +02001730PyObject*
1731PyUnicode_Copy(PyObject *unicode)
1732{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001733 Py_ssize_t size;
1734 PyObject *copy;
1735 void *data;
1736
Victor Stinner034f6cf2011-09-30 02:26:44 +02001737 if (!PyUnicode_Check(unicode)) {
1738 PyErr_BadInternalCall();
1739 return NULL;
1740 }
1741 if (PyUnicode_READY(unicode))
1742 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001743
1744 size = PyUnicode_GET_LENGTH(unicode);
1745 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1746 if (!copy)
1747 return NULL;
1748 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1749
1750 data = PyUnicode_DATA(unicode);
1751 switch (PyUnicode_KIND(unicode))
1752 {
1753 case PyUnicode_1BYTE_KIND:
1754 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1755 break;
1756 case PyUnicode_2BYTE_KIND:
1757 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1758 break;
1759 case PyUnicode_4BYTE_KIND:
1760 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1761 break;
1762 default:
1763 assert(0);
1764 break;
1765 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001766 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001767 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001768}
1769
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001770
Victor Stinnerbc603d12011-10-02 01:00:40 +02001771/* Widen Unicode objects to larger buffers. Don't write terminating null
1772 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001773
1774void*
1775_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1776{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001777 Py_ssize_t len;
1778 void *result;
1779 unsigned int skind;
1780
1781 if (PyUnicode_READY(s))
1782 return NULL;
1783
1784 len = PyUnicode_GET_LENGTH(s);
1785 skind = PyUnicode_KIND(s);
1786 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001787 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001788 return NULL;
1789 }
1790 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001791 case PyUnicode_2BYTE_KIND:
1792 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1793 if (!result)
1794 return PyErr_NoMemory();
1795 assert(skind == PyUnicode_1BYTE_KIND);
1796 _PyUnicode_CONVERT_BYTES(
1797 Py_UCS1, Py_UCS2,
1798 PyUnicode_1BYTE_DATA(s),
1799 PyUnicode_1BYTE_DATA(s) + len,
1800 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001801 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001802 case PyUnicode_4BYTE_KIND:
1803 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1804 if (!result)
1805 return PyErr_NoMemory();
1806 if (skind == PyUnicode_2BYTE_KIND) {
1807 _PyUnicode_CONVERT_BYTES(
1808 Py_UCS2, Py_UCS4,
1809 PyUnicode_2BYTE_DATA(s),
1810 PyUnicode_2BYTE_DATA(s) + len,
1811 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001812 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001813 else {
1814 assert(skind == PyUnicode_1BYTE_KIND);
1815 _PyUnicode_CONVERT_BYTES(
1816 Py_UCS1, Py_UCS4,
1817 PyUnicode_1BYTE_DATA(s),
1818 PyUnicode_1BYTE_DATA(s) + len,
1819 result);
1820 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001821 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001822 default:
1823 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001824 }
Victor Stinner01698042011-10-04 00:04:26 +02001825 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001826 return NULL;
1827}
1828
1829static Py_UCS4*
1830as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1831 int copy_null)
1832{
1833 int kind;
1834 void *data;
1835 Py_ssize_t len, targetlen;
1836 if (PyUnicode_READY(string) == -1)
1837 return NULL;
1838 kind = PyUnicode_KIND(string);
1839 data = PyUnicode_DATA(string);
1840 len = PyUnicode_GET_LENGTH(string);
1841 targetlen = len;
1842 if (copy_null)
1843 targetlen++;
1844 if (!target) {
1845 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1846 PyErr_NoMemory();
1847 return NULL;
1848 }
1849 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1850 if (!target) {
1851 PyErr_NoMemory();
1852 return NULL;
1853 }
1854 }
1855 else {
1856 if (targetsize < targetlen) {
1857 PyErr_Format(PyExc_SystemError,
1858 "string is longer than the buffer");
1859 if (copy_null && 0 < targetsize)
1860 target[0] = 0;
1861 return NULL;
1862 }
1863 }
1864 if (kind != PyUnicode_4BYTE_KIND) {
1865 Py_ssize_t i;
1866 for (i = 0; i < len; i++)
1867 target[i] = PyUnicode_READ(kind, data, i);
1868 }
1869 else
1870 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1871 if (copy_null)
1872 target[len] = 0;
1873 return target;
1874}
1875
1876Py_UCS4*
1877PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1878 int copy_null)
1879{
1880 if (target == NULL || targetsize < 1) {
1881 PyErr_BadInternalCall();
1882 return NULL;
1883 }
1884 return as_ucs4(string, target, targetsize, copy_null);
1885}
1886
1887Py_UCS4*
1888PyUnicode_AsUCS4Copy(PyObject *string)
1889{
1890 return as_ucs4(string, NULL, 0, 1);
1891}
1892
1893#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00001894
Alexander Belopolsky40018472011-02-26 01:02:56 +00001895PyObject *
1896PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001897{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001898 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +00001899 if (size == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001900 return PyUnicode_New(0, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00001901 PyErr_BadInternalCall();
1902 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001903 }
1904
Martin v. Löwis790465f2008-04-05 20:41:37 +00001905 if (size == -1) {
1906 size = wcslen(w);
1907 }
1908
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001909 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001910}
1911
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001912#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001913
Walter Dörwald346737f2007-05-31 10:44:43 +00001914static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001915makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1916 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001917{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001918 *fmt++ = '%';
1919 if (width) {
1920 if (zeropad)
1921 *fmt++ = '0';
1922 fmt += sprintf(fmt, "%d", width);
1923 }
1924 if (precision)
1925 fmt += sprintf(fmt, ".%d", precision);
1926 if (longflag)
1927 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001928 else if (longlongflag) {
1929 /* longlongflag should only ever be nonzero on machines with
1930 HAVE_LONG_LONG defined */
1931#ifdef HAVE_LONG_LONG
1932 char *f = PY_FORMAT_LONG_LONG;
1933 while (*f)
1934 *fmt++ = *f++;
1935#else
1936 /* we shouldn't ever get here */
1937 assert(0);
1938 *fmt++ = 'l';
1939#endif
1940 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001941 else if (size_tflag) {
1942 char *f = PY_FORMAT_SIZE_T;
1943 while (*f)
1944 *fmt++ = *f++;
1945 }
1946 *fmt++ = c;
1947 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001948}
1949
Victor Stinner96865452011-03-01 23:44:09 +00001950/* helper for PyUnicode_FromFormatV() */
1951
1952static const char*
1953parse_format_flags(const char *f,
1954 int *p_width, int *p_precision,
1955 int *p_longflag, int *p_longlongflag, int *p_size_tflag)
1956{
1957 int width, precision, longflag, longlongflag, size_tflag;
1958
1959 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
1960 f++;
1961 width = 0;
1962 while (Py_ISDIGIT((unsigned)*f))
1963 width = (width*10) + *f++ - '0';
1964 precision = 0;
1965 if (*f == '.') {
1966 f++;
1967 while (Py_ISDIGIT((unsigned)*f))
1968 precision = (precision*10) + *f++ - '0';
1969 if (*f == '%') {
1970 /* "%.3%s" => f points to "3" */
1971 f--;
1972 }
1973 }
1974 if (*f == '\0') {
1975 /* bogus format "%.1" => go backward, f points to "1" */
1976 f--;
1977 }
1978 if (p_width != NULL)
1979 *p_width = width;
1980 if (p_precision != NULL)
1981 *p_precision = precision;
1982
1983 /* Handle %ld, %lu, %lld and %llu. */
1984 longflag = 0;
1985 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00001986 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00001987
1988 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00001989 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00001990 longflag = 1;
1991 ++f;
1992 }
1993#ifdef HAVE_LONG_LONG
1994 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00001995 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00001996 longlongflag = 1;
1997 f += 2;
1998 }
1999#endif
2000 }
2001 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002002 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002003 size_tflag = 1;
2004 ++f;
2005 }
2006 if (p_longflag != NULL)
2007 *p_longflag = longflag;
2008 if (p_longlongflag != NULL)
2009 *p_longlongflag = longlongflag;
2010 if (p_size_tflag != NULL)
2011 *p_size_tflag = size_tflag;
2012 return f;
2013}
2014
/* Maximum number of characters needed for the output of %ld:
   21 characters are enough for a 64-bit integer written in decimal
   plus an optional sign. */
#define MAX_LONG_CHARS 21
/* Maximum number of characters needed for the output of %lld:
   at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, plus 1 for the
   sign.  53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2022
Walter Dörwaldd2034312007-05-18 16:29:38 +00002023PyObject *
2024PyUnicode_FromFormatV(const char *format, va_list vargs)
2025{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002026 va_list count;
2027 Py_ssize_t callcount = 0;
2028 PyObject **callresults = NULL;
2029 PyObject **callresult = NULL;
2030 Py_ssize_t n = 0;
2031 int width = 0;
2032 int precision = 0;
2033 int zeropad;
2034 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002035 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002036 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002037 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002038 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2039 Py_UCS4 argmaxchar;
2040 Py_ssize_t numbersize = 0;
2041 char *numberresults = NULL;
2042 char *numberresult = NULL;
2043 Py_ssize_t i;
2044 int kind;
2045 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002046
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002047 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002048 /* step 1: count the number of %S/%R/%A/%s format specifications
2049 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2050 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002051 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002052 * also estimate a upper bound for all the number formats in the string,
2053 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002054 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002055 for (f = format; *f; f++) {
2056 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002057 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002058 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2059 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2060 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2061 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002062
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002063 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002064#ifdef HAVE_LONG_LONG
2065 if (longlongflag) {
2066 if (width < MAX_LONG_LONG_CHARS)
2067 width = MAX_LONG_LONG_CHARS;
2068 }
2069 else
2070#endif
2071 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2072 including sign. Decimal takes the most space. This
2073 isn't enough for octal. If a width is specified we
2074 need more (which we allocate later). */
2075 if (width < MAX_LONG_CHARS)
2076 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002077
2078 /* account for the size + '\0' to separate numbers
2079 inside of the numberresults buffer */
2080 numbersize += (width + 1);
2081 }
2082 }
2083 else if ((unsigned char)*f > 127) {
2084 PyErr_Format(PyExc_ValueError,
2085 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2086 "string, got a non-ASCII byte: 0x%02x",
2087 (unsigned char)*f);
2088 return NULL;
2089 }
2090 }
2091 /* step 2: allocate memory for the results of
2092 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2093 if (callcount) {
2094 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2095 if (!callresults) {
2096 PyErr_NoMemory();
2097 return NULL;
2098 }
2099 callresult = callresults;
2100 }
2101 /* step 2.5: allocate memory for the results of formating numbers */
2102 if (numbersize) {
2103 numberresults = PyObject_Malloc(numbersize);
2104 if (!numberresults) {
2105 PyErr_NoMemory();
2106 goto fail;
2107 }
2108 numberresult = numberresults;
2109 }
2110
2111 /* step 3: format numbers and figure out how large a buffer we need */
2112 for (f = format; *f; f++) {
2113 if (*f == '%') {
2114 const char* p;
2115 int longflag;
2116 int longlongflag;
2117 int size_tflag;
2118 int numprinted;
2119
2120 p = f;
2121 zeropad = (f[1] == '0');
2122 f = parse_format_flags(f, &width, &precision,
2123 &longflag, &longlongflag, &size_tflag);
2124 switch (*f) {
2125 case 'c':
2126 {
2127 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002128 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002129 n++;
2130 break;
2131 }
2132 case '%':
2133 n++;
2134 break;
2135 case 'i':
2136 case 'd':
2137 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2138 width, precision, *f);
2139 if (longflag)
2140 numprinted = sprintf(numberresult, fmt,
2141 va_arg(count, long));
2142#ifdef HAVE_LONG_LONG
2143 else if (longlongflag)
2144 numprinted = sprintf(numberresult, fmt,
2145 va_arg(count, PY_LONG_LONG));
2146#endif
2147 else if (size_tflag)
2148 numprinted = sprintf(numberresult, fmt,
2149 va_arg(count, Py_ssize_t));
2150 else
2151 numprinted = sprintf(numberresult, fmt,
2152 va_arg(count, int));
2153 n += numprinted;
2154 /* advance by +1 to skip over the '\0' */
2155 numberresult += (numprinted + 1);
2156 assert(*(numberresult - 1) == '\0');
2157 assert(*(numberresult - 2) != '\0');
2158 assert(numprinted >= 0);
2159 assert(numberresult <= numberresults + numbersize);
2160 break;
2161 case 'u':
2162 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2163 width, precision, 'u');
2164 if (longflag)
2165 numprinted = sprintf(numberresult, fmt,
2166 va_arg(count, unsigned long));
2167#ifdef HAVE_LONG_LONG
2168 else if (longlongflag)
2169 numprinted = sprintf(numberresult, fmt,
2170 va_arg(count, unsigned PY_LONG_LONG));
2171#endif
2172 else if (size_tflag)
2173 numprinted = sprintf(numberresult, fmt,
2174 va_arg(count, size_t));
2175 else
2176 numprinted = sprintf(numberresult, fmt,
2177 va_arg(count, unsigned int));
2178 n += numprinted;
2179 numberresult += (numprinted + 1);
2180 assert(*(numberresult - 1) == '\0');
2181 assert(*(numberresult - 2) != '\0');
2182 assert(numprinted >= 0);
2183 assert(numberresult <= numberresults + numbersize);
2184 break;
2185 case 'x':
2186 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2187 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2188 n += numprinted;
2189 numberresult += (numprinted + 1);
2190 assert(*(numberresult - 1) == '\0');
2191 assert(*(numberresult - 2) != '\0');
2192 assert(numprinted >= 0);
2193 assert(numberresult <= numberresults + numbersize);
2194 break;
2195 case 'p':
2196 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2197 /* %p is ill-defined: ensure leading 0x. */
2198 if (numberresult[1] == 'X')
2199 numberresult[1] = 'x';
2200 else if (numberresult[1] != 'x') {
2201 memmove(numberresult + 2, numberresult,
2202 strlen(numberresult) + 1);
2203 numberresult[0] = '0';
2204 numberresult[1] = 'x';
2205 numprinted += 2;
2206 }
2207 n += numprinted;
2208 numberresult += (numprinted + 1);
2209 assert(*(numberresult - 1) == '\0');
2210 assert(*(numberresult - 2) != '\0');
2211 assert(numprinted >= 0);
2212 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002213 break;
2214 case 's':
2215 {
2216 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002217 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002218 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2219 if (!str)
2220 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002221 /* since PyUnicode_DecodeUTF8 returns already flexible
2222 unicode objects, there is no need to call ready on them */
2223 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002224 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002225 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002226 /* Remember the str and switch to the next slot */
2227 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002228 break;
2229 }
2230 case 'U':
2231 {
2232 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002233 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002234 if (PyUnicode_READY(obj) == -1)
2235 goto fail;
2236 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002237 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002238 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002239 break;
2240 }
2241 case 'V':
2242 {
2243 PyObject *obj = va_arg(count, PyObject *);
2244 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002245 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002246 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002247 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002248 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002249 if (PyUnicode_READY(obj) == -1)
2250 goto fail;
2251 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002252 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002253 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002254 *callresult++ = NULL;
2255 }
2256 else {
2257 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2258 if (!str_obj)
2259 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002260 if (PyUnicode_READY(str_obj)) {
2261 Py_DECREF(str_obj);
2262 goto fail;
2263 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002264 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002265 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002266 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002267 *callresult++ = str_obj;
2268 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002269 break;
2270 }
2271 case 'S':
2272 {
2273 PyObject *obj = va_arg(count, PyObject *);
2274 PyObject *str;
2275 assert(obj);
2276 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002277 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002278 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002279 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002280 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002281 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002282 /* Remember the str and switch to the next slot */
2283 *callresult++ = str;
2284 break;
2285 }
2286 case 'R':
2287 {
2288 PyObject *obj = va_arg(count, PyObject *);
2289 PyObject *repr;
2290 assert(obj);
2291 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002292 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002293 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002294 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002295 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002296 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002297 /* Remember the repr and switch to the next slot */
2298 *callresult++ = repr;
2299 break;
2300 }
2301 case 'A':
2302 {
2303 PyObject *obj = va_arg(count, PyObject *);
2304 PyObject *ascii;
2305 assert(obj);
2306 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002307 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002308 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002309 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002310 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002311 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002312 /* Remember the repr and switch to the next slot */
2313 *callresult++ = ascii;
2314 break;
2315 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002316 default:
2317 /* if we stumble upon an unknown
2318 formatting code, copy the rest of
2319 the format string to the output
2320 string. (we cannot just skip the
2321 code, since there's no way to know
2322 what's in the argument list) */
2323 n += strlen(p);
2324 goto expand;
2325 }
2326 } else
2327 n++;
2328 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002329 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002330 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002331 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002332 we don't have to resize the string.
2333 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002334 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002335 if (!string)
2336 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002337 kind = PyUnicode_KIND(string);
2338 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002339 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002340 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002341
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002342 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002343 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002344 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002345
2346 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002347 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2348 /* checking for == because the last argument could be a empty
2349 string, which causes i to point to end, the assert at the end of
2350 the loop */
2351 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002352
Benjamin Peterson14339b62009-01-31 16:36:08 +00002353 switch (*f) {
2354 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002355 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002356 const int ordinal = va_arg(vargs, int);
2357 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002358 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002359 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002360 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002361 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002362 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002363 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002364 case 'p':
2365 /* unused, since we already have the result */
2366 if (*f == 'p')
2367 (void) va_arg(vargs, void *);
2368 else
2369 (void) va_arg(vargs, int);
2370 /* extract the result from numberresults and append. */
2371 for (; *numberresult; ++i, ++numberresult)
2372 PyUnicode_WRITE(kind, data, i, *numberresult);
2373 /* skip over the separating '\0' */
2374 assert(*numberresult == '\0');
2375 numberresult++;
2376 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002377 break;
2378 case 's':
2379 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002380 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002381 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002382 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002383 size = PyUnicode_GET_LENGTH(*callresult);
2384 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002385 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002386 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002387 /* We're done with the unicode()/repr() => forget it */
2388 Py_DECREF(*callresult);
2389 /* switch to next unicode()/repr() result */
2390 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002391 break;
2392 }
2393 case 'U':
2394 {
2395 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002396 Py_ssize_t size;
2397 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2398 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002399 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002400 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002401 break;
2402 }
2403 case 'V':
2404 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002405 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002406 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002407 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002408 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002409 size = PyUnicode_GET_LENGTH(obj);
2410 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002411 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002412 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002413 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002414 size = PyUnicode_GET_LENGTH(*callresult);
2415 assert(PyUnicode_KIND(*callresult) <=
2416 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002417 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002418 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002419 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002420 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002421 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002422 break;
2423 }
2424 case 'S':
2425 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002426 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002427 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002428 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002429 /* unused, since we already have the result */
2430 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002431 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002432 copy_characters(string, i, *callresult, 0, size);
2433 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002434 /* We're done with the unicode()/repr() => forget it */
2435 Py_DECREF(*callresult);
2436 /* switch to next unicode()/repr() result */
2437 ++callresult;
2438 break;
2439 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002440 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002441 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002442 break;
2443 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002444 for (; *p; ++p, ++i)
2445 PyUnicode_WRITE(kind, data, i, *p);
2446 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002447 goto end;
2448 }
Victor Stinner1205f272010-09-11 00:54:47 +00002449 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002450 else {
2451 assert(i < PyUnicode_GET_LENGTH(string));
2452 PyUnicode_WRITE(kind, data, i++, *f);
2453 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002454 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002455 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002456
Benjamin Peterson29060642009-01-31 22:14:21 +00002457 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002458 if (callresults)
2459 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002460 if (numberresults)
2461 PyObject_Free(numberresults);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002462 assert(_PyUnicode_CheckConsistency(string, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002463 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002464 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002465 if (callresults) {
2466 PyObject **callresult2 = callresults;
2467 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002468 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002469 ++callresult2;
2470 }
2471 PyObject_Free(callresults);
2472 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002473 if (numberresults)
2474 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002475 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002476}
2477
Walter Dörwaldd2034312007-05-18 16:29:38 +00002478PyObject *
2479PyUnicode_FromFormat(const char *format, ...)
2480{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002481 PyObject* ret;
2482 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002483
2484#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002485 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002486#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002487 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002488#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002489 ret = PyUnicode_FromFormatV(format, vargs);
2490 va_end(vargs);
2491 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002492}
2493
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002494#ifdef HAVE_WCHAR_H
2495
Victor Stinner5593d8a2010-10-02 11:11:27 +00002496/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2497 convert a Unicode object to a wide character string.
2498
Victor Stinnerd88d9832011-09-06 02:00:05 +02002499 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002500 character) required to convert the unicode object. Ignore size argument.
2501
Victor Stinnerd88d9832011-09-06 02:00:05 +02002502 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002503 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002504 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002505static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002506unicode_aswidechar(PyUnicodeObject *unicode,
2507 wchar_t *w,
2508 Py_ssize_t size)
2509{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002510 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002511 const wchar_t *wstr;
2512
2513 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2514 if (wstr == NULL)
2515 return -1;
2516
Victor Stinner5593d8a2010-10-02 11:11:27 +00002517 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002518 if (size > res)
2519 size = res + 1;
2520 else
2521 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002522 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002523 return res;
2524 }
2525 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002526 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002527}
2528
2529Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002530PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002531 wchar_t *w,
2532 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002533{
2534 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002535 PyErr_BadInternalCall();
2536 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002537 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002538 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002539}
2540
Victor Stinner137c34c2010-09-29 10:25:54 +00002541wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002542PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002543 Py_ssize_t *size)
2544{
2545 wchar_t* buffer;
2546 Py_ssize_t buflen;
2547
2548 if (unicode == NULL) {
2549 PyErr_BadInternalCall();
2550 return NULL;
2551 }
2552
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002553 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002554 if (buflen == -1)
2555 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002556 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002557 PyErr_NoMemory();
2558 return NULL;
2559 }
2560
Victor Stinner137c34c2010-09-29 10:25:54 +00002561 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2562 if (buffer == NULL) {
2563 PyErr_NoMemory();
2564 return NULL;
2565 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002566 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002567 if (buflen == -1)
2568 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002569 if (size != NULL)
2570 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002571 return buffer;
2572}
2573
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002574#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002575
Alexander Belopolsky40018472011-02-26 01:02:56 +00002576PyObject *
2577PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002578{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002579 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002580 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002581 PyErr_SetString(PyExc_ValueError,
2582 "chr() arg not in range(0x110000)");
2583 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002584 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002585
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002586 if (ordinal < 256)
2587 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002588
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002589 v = PyUnicode_New(1, ordinal);
2590 if (v == NULL)
2591 return NULL;
2592 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002593 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002594 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002595}
2596
Alexander Belopolsky40018472011-02-26 01:02:56 +00002597PyObject *
2598PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002599{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002600 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002601 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002602 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002603 if (PyUnicode_READY(obj))
2604 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002605 Py_INCREF(obj);
2606 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002607 }
2608 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002609 /* For a Unicode subtype that's not a Unicode object,
2610 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002611 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002612 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002613 PyErr_Format(PyExc_TypeError,
2614 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002615 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002616 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002617}
2618
Alexander Belopolsky40018472011-02-26 01:02:56 +00002619PyObject *
2620PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002621 const char *encoding,
2622 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002623{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002624 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002625 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002626
Guido van Rossumd57fd912000-03-10 22:53:23 +00002627 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002628 PyErr_BadInternalCall();
2629 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002630 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002631
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002632 /* Decoding bytes objects is the most common case and should be fast */
2633 if (PyBytes_Check(obj)) {
2634 if (PyBytes_GET_SIZE(obj) == 0) {
2635 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002636 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002637 }
2638 else {
2639 v = PyUnicode_Decode(
2640 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2641 encoding, errors);
2642 }
2643 return v;
2644 }
2645
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002646 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002647 PyErr_SetString(PyExc_TypeError,
2648 "decoding str is not supported");
2649 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002650 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002651
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002652 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2653 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2654 PyErr_Format(PyExc_TypeError,
2655 "coercing to str: need bytes, bytearray "
2656 "or buffer-like object, %.80s found",
2657 Py_TYPE(obj)->tp_name);
2658 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002659 }
Tim Petersced69f82003-09-16 20:30:58 +00002660
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002661 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002662 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002663 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002664 }
Tim Petersced69f82003-09-16 20:30:58 +00002665 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002666 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002667
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002668 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002669 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002670}
2671
/* Write a normalized copy of encoding into lower: upper case letters become
   lower case and '_' becomes '-', so that spellings such as UTF_8 are
   recognized.  lower has room for lower_len characters including the
   terminating null.

   Return 1 on success, 0 if encoding is longer than lower_len-1 characters
   (lower is then left without a terminator). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* Last slot of the output buffer, reserved for the terminating null. */
    char *const limit = &lower[lower_len - 1];

    for (src = encoding; *src; src++) {
        char ch = *src;

        if (dst == limit)
            return 0;

        if (Py_ISUPPER(ch))
            ch = Py_TOLOWER(ch);
        else if (ch == '_')
            ch = '-';
        *dst++ = ch;
    }
    *dst = '\0';
    return 1;
}
2704
Alexander Belopolsky40018472011-02-26 01:02:56 +00002705PyObject *
2706PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002707 Py_ssize_t size,
2708 const char *encoding,
2709 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002710{
2711 PyObject *buffer = NULL, *unicode;
2712 Py_buffer info;
2713 char lower[11]; /* Enough for any encoding shortcut */
2714
2715 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002716 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002717
2718 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002719 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002720 if ((strcmp(lower, "utf-8") == 0) ||
2721 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002722 return PyUnicode_DecodeUTF8(s, size, errors);
2723 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002724 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002725 (strcmp(lower, "iso-8859-1") == 0))
2726 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002727#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002728 else if (strcmp(lower, "mbcs") == 0)
2729 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002730#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002731 else if (strcmp(lower, "ascii") == 0)
2732 return PyUnicode_DecodeASCII(s, size, errors);
2733 else if (strcmp(lower, "utf-16") == 0)
2734 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2735 else if (strcmp(lower, "utf-32") == 0)
2736 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2737 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002738
2739 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002740 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002741 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002742 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002743 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002744 if (buffer == NULL)
2745 goto onError;
2746 unicode = PyCodec_Decode(buffer, encoding, errors);
2747 if (unicode == NULL)
2748 goto onError;
2749 if (!PyUnicode_Check(unicode)) {
2750 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002751 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002752 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002753 Py_DECREF(unicode);
2754 goto onError;
2755 }
2756 Py_DECREF(buffer);
Victor Stinner17efeed2011-10-04 20:05:46 +02002757#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02002758 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002759 Py_DECREF(unicode);
2760 return NULL;
2761 }
Victor Stinner17efeed2011-10-04 20:05:46 +02002762#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002763 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002764 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002765
Benjamin Peterson29060642009-01-31 22:14:21 +00002766 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002767 Py_XDECREF(buffer);
2768 return NULL;
2769}
2770
Alexander Belopolsky40018472011-02-26 01:02:56 +00002771PyObject *
2772PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002773 const char *encoding,
2774 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002775{
2776 PyObject *v;
2777
2778 if (!PyUnicode_Check(unicode)) {
2779 PyErr_BadArgument();
2780 goto onError;
2781 }
2782
2783 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002784 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002785
2786 /* Decode via the codec registry */
2787 v = PyCodec_Decode(unicode, encoding, errors);
2788 if (v == NULL)
2789 goto onError;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002790 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002791 return v;
2792
Benjamin Peterson29060642009-01-31 22:14:21 +00002793 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002794 return NULL;
2795}
2796
Alexander Belopolsky40018472011-02-26 01:02:56 +00002797PyObject *
2798PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002799 const char *encoding,
2800 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002801{
2802 PyObject *v;
2803
2804 if (!PyUnicode_Check(unicode)) {
2805 PyErr_BadArgument();
2806 goto onError;
2807 }
2808
2809 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002810 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002811
2812 /* Decode via the codec registry */
2813 v = PyCodec_Decode(unicode, encoding, errors);
2814 if (v == NULL)
2815 goto onError;
2816 if (!PyUnicode_Check(v)) {
2817 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002818 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002819 Py_TYPE(v)->tp_name);
2820 Py_DECREF(v);
2821 goto onError;
2822 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002823 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002824 return v;
2825
Benjamin Peterson29060642009-01-31 22:14:21 +00002826 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002827 return NULL;
2828}
2829
Alexander Belopolsky40018472011-02-26 01:02:56 +00002830PyObject *
2831PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002832 Py_ssize_t size,
2833 const char *encoding,
2834 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002835{
2836 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002837
Guido van Rossumd57fd912000-03-10 22:53:23 +00002838 unicode = PyUnicode_FromUnicode(s, size);
2839 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002840 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002841 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2842 Py_DECREF(unicode);
2843 return v;
2844}
2845
Alexander Belopolsky40018472011-02-26 01:02:56 +00002846PyObject *
2847PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002848 const char *encoding,
2849 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002850{
2851 PyObject *v;
2852
2853 if (!PyUnicode_Check(unicode)) {
2854 PyErr_BadArgument();
2855 goto onError;
2856 }
2857
2858 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002859 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002860
2861 /* Encode via the codec registry */
2862 v = PyCodec_Encode(unicode, encoding, errors);
2863 if (v == NULL)
2864 goto onError;
2865 return v;
2866
Benjamin Peterson29060642009-01-31 22:14:21 +00002867 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002868 return NULL;
2869}
2870
Victor Stinnerad158722010-10-27 00:25:46 +00002871PyObject *
2872PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002873{
Victor Stinner99b95382011-07-04 14:23:54 +02002874#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002875 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2876 PyUnicode_GET_SIZE(unicode),
2877 NULL);
2878#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002879 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002880#else
Victor Stinner793b5312011-04-27 00:24:21 +02002881 PyInterpreterState *interp = PyThreadState_GET()->interp;
2882 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2883 cannot use it to encode and decode filenames before it is loaded. Load
2884 the Python codec requires to encode at least its own filename. Use the C
2885 version of the locale codec until the codec registry is initialized and
2886 the Python codec is loaded.
2887
2888 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2889 cannot only rely on it: check also interp->fscodec_initialized for
2890 subinterpreters. */
2891 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002892 return PyUnicode_AsEncodedString(unicode,
2893 Py_FileSystemDefaultEncoding,
2894 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002895 }
2896 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002897 /* locale encoding with surrogateescape */
2898 wchar_t *wchar;
2899 char *bytes;
2900 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002901 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002902
2903 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2904 if (wchar == NULL)
2905 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002906 bytes = _Py_wchar2char(wchar, &error_pos);
2907 if (bytes == NULL) {
2908 if (error_pos != (size_t)-1) {
2909 char *errmsg = strerror(errno);
2910 PyObject *exc = NULL;
2911 if (errmsg == NULL)
2912 errmsg = "Py_wchar2char() failed";
2913 raise_encode_exception(&exc,
2914 "filesystemencoding",
2915 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2916 error_pos, error_pos+1,
2917 errmsg);
2918 Py_XDECREF(exc);
2919 }
2920 else
2921 PyErr_NoMemory();
2922 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002923 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002924 }
2925 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002926
2927 bytes_obj = PyBytes_FromString(bytes);
2928 PyMem_Free(bytes);
2929 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002930 }
Victor Stinnerad158722010-10-27 00:25:46 +00002931#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002932}
2933
Alexander Belopolsky40018472011-02-26 01:02:56 +00002934PyObject *
2935PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002936 const char *encoding,
2937 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002938{
2939 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002940 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002941
Guido van Rossumd57fd912000-03-10 22:53:23 +00002942 if (!PyUnicode_Check(unicode)) {
2943 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002944 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002945 }
Fred Drakee4315f52000-05-09 19:53:39 +00002946
Victor Stinner2f283c22011-03-02 01:21:46 +00002947 if (encoding == NULL) {
2948 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002949 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002950 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002951 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002952 }
Fred Drakee4315f52000-05-09 19:53:39 +00002953
2954 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002955 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002956 if ((strcmp(lower, "utf-8") == 0) ||
2957 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002958 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002959 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002960 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002961 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002962 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002963 }
Victor Stinner37296e82010-06-10 13:36:23 +00002964 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002965 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002966 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002967 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002968#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002969 else if (strcmp(lower, "mbcs") == 0)
2970 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2971 PyUnicode_GET_SIZE(unicode),
2972 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002973#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002974 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002975 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002976 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002977
2978 /* Encode via the codec registry */
2979 v = PyCodec_Encode(unicode, encoding, errors);
2980 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002981 return NULL;
2982
2983 /* The normal path */
2984 if (PyBytes_Check(v))
2985 return v;
2986
2987 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002988 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002989 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002990 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002991
2992 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2993 "encoder %s returned bytearray instead of bytes",
2994 encoding);
2995 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002996 Py_DECREF(v);
2997 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002998 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002999
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003000 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3001 Py_DECREF(v);
3002 return b;
3003 }
3004
3005 PyErr_Format(PyExc_TypeError,
3006 "encoder did not return a bytes object (type=%.400s)",
3007 Py_TYPE(v)->tp_name);
3008 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003009 return NULL;
3010}
3011
Alexander Belopolsky40018472011-02-26 01:02:56 +00003012PyObject *
3013PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003014 const char *encoding,
3015 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003016{
3017 PyObject *v;
3018
3019 if (!PyUnicode_Check(unicode)) {
3020 PyErr_BadArgument();
3021 goto onError;
3022 }
3023
3024 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003025 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003026
3027 /* Encode via the codec registry */
3028 v = PyCodec_Encode(unicode, encoding, errors);
3029 if (v == NULL)
3030 goto onError;
3031 if (!PyUnicode_Check(v)) {
3032 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003033 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003034 Py_TYPE(v)->tp_name);
3035 Py_DECREF(v);
3036 goto onError;
3037 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003038 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003039
Benjamin Peterson29060642009-01-31 22:14:21 +00003040 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003041 return NULL;
3042}
3043
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003044PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003045PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003046 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003047 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3048}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003049
Christian Heimes5894ba72007-11-04 11:43:14 +00003050PyObject*
3051PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3052{
Victor Stinner99b95382011-07-04 14:23:54 +02003053#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003054 return PyUnicode_DecodeMBCS(s, size, NULL);
3055#elif defined(__APPLE__)
3056 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
3057#else
Victor Stinner793b5312011-04-27 00:24:21 +02003058 PyInterpreterState *interp = PyThreadState_GET()->interp;
3059 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3060 cannot use it to encode and decode filenames before it is loaded. Load
3061 the Python codec requires to encode at least its own filename. Use the C
3062 version of the locale codec until the codec registry is initialized and
3063 the Python codec is loaded.
3064
3065 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3066 cannot only rely on it: check also interp->fscodec_initialized for
3067 subinterpreters. */
3068 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003069 return PyUnicode_Decode(s, size,
3070 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003071 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003072 }
3073 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003074 /* locale encoding with surrogateescape */
3075 wchar_t *wchar;
3076 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00003077 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003078
3079 if (s[size] != '\0' || size != strlen(s)) {
3080 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3081 return NULL;
3082 }
3083
Victor Stinner168e1172010-10-16 23:16:16 +00003084 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003085 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003086 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003087
Victor Stinner168e1172010-10-16 23:16:16 +00003088 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003089 PyMem_Free(wchar);
3090 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003091 }
Victor Stinnerad158722010-10-27 00:25:46 +00003092#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003093}
3094
Martin v. Löwis011e8422009-05-05 04:43:17 +00003095
3096int
3097PyUnicode_FSConverter(PyObject* arg, void* addr)
3098{
3099 PyObject *output = NULL;
3100 Py_ssize_t size;
3101 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003102 if (arg == NULL) {
3103 Py_DECREF(*(PyObject**)addr);
3104 return 1;
3105 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003106 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003107 output = arg;
3108 Py_INCREF(output);
3109 }
3110 else {
3111 arg = PyUnicode_FromObject(arg);
3112 if (!arg)
3113 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003114 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003115 Py_DECREF(arg);
3116 if (!output)
3117 return 0;
3118 if (!PyBytes_Check(output)) {
3119 Py_DECREF(output);
3120 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3121 return 0;
3122 }
3123 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003124 size = PyBytes_GET_SIZE(output);
3125 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003126 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003127 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003128 Py_DECREF(output);
3129 return 0;
3130 }
3131 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003132 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003133}
3134
3135
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003136int
3137PyUnicode_FSDecoder(PyObject* arg, void* addr)
3138{
3139 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003140 if (arg == NULL) {
3141 Py_DECREF(*(PyObject**)addr);
3142 return 1;
3143 }
3144 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003145 if (PyUnicode_READY(arg))
3146 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003147 output = arg;
3148 Py_INCREF(output);
3149 }
3150 else {
3151 arg = PyBytes_FromObject(arg);
3152 if (!arg)
3153 return 0;
3154 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3155 PyBytes_GET_SIZE(arg));
3156 Py_DECREF(arg);
3157 if (!output)
3158 return 0;
3159 if (!PyUnicode_Check(output)) {
3160 Py_DECREF(output);
3161 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3162 return 0;
3163 }
3164 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003165 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3166 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003167 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3168 Py_DECREF(output);
3169 return 0;
3170 }
3171 *(PyObject**)addr = output;
3172 return Py_CLEANUP_SUPPORTED;
3173}
3174
3175
Martin v. Löwis5b222132007-06-10 09:51:05 +00003176char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003177PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003178{
Christian Heimesf3863112007-11-22 07:46:41 +00003179 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003180 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
3181
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003182 if (!PyUnicode_Check(unicode)) {
3183 PyErr_BadArgument();
3184 return NULL;
3185 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003186 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003187 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003188
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003189 if (PyUnicode_UTF8(unicode) == NULL) {
3190 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003191 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3192 if (bytes == NULL)
3193 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003194 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3195 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003196 Py_DECREF(bytes);
3197 return NULL;
3198 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003199 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
3200 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003201 Py_DECREF(bytes);
3202 }
3203
3204 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003205 *psize = PyUnicode_UTF8_LENGTH(unicode);
3206 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003207}
3208
3209char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003210PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003211{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003212 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3213}
3214
3215#ifdef Py_DEBUG
3216int unicode_as_unicode_calls = 0;
3217#endif
3218
3219
3220Py_UNICODE *
3221PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3222{
3223 PyUnicodeObject *u;
3224 const unsigned char *one_byte;
3225#if SIZEOF_WCHAR_T == 4
3226 const Py_UCS2 *two_bytes;
3227#else
3228 const Py_UCS4 *four_bytes;
3229 const Py_UCS4 *ucs4_end;
3230 Py_ssize_t num_surrogates;
3231#endif
3232 wchar_t *w;
3233 wchar_t *wchar_end;
3234
3235 if (!PyUnicode_Check(unicode)) {
3236 PyErr_BadArgument();
3237 return NULL;
3238 }
3239 u = (PyUnicodeObject*)unicode;
3240 if (_PyUnicode_WSTR(u) == NULL) {
3241 /* Non-ASCII compact unicode object */
3242 assert(_PyUnicode_KIND(u) != 0);
3243 assert(PyUnicode_IS_READY(u));
3244
3245#ifdef Py_DEBUG
3246 ++unicode_as_unicode_calls;
3247#endif
3248
3249 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
3250#if SIZEOF_WCHAR_T == 2
3251 four_bytes = PyUnicode_4BYTE_DATA(u);
3252 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
3253 num_surrogates = 0;
3254
3255 for (; four_bytes < ucs4_end; ++four_bytes) {
3256 if (*four_bytes > 0xFFFF)
3257 ++num_surrogates;
3258 }
3259
3260 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3261 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3262 if (!_PyUnicode_WSTR(u)) {
3263 PyErr_NoMemory();
3264 return NULL;
3265 }
3266 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3267
3268 w = _PyUnicode_WSTR(u);
3269 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3270 four_bytes = PyUnicode_4BYTE_DATA(u);
3271 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3272 if (*four_bytes > 0xFFFF) {
3273 /* encode surrogate pair in this case */
3274 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3275 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3276 }
3277 else
3278 *w = *four_bytes;
3279
3280 if (w > wchar_end) {
3281 assert(0 && "Miscalculated string end");
3282 }
3283 }
3284 *w = 0;
3285#else
3286 /* sizeof(wchar_t) == 4 */
3287 Py_FatalError("Impossible unicode object state, wstr and str "
3288 "should share memory already.");
3289 return NULL;
3290#endif
3291 }
3292 else {
3293 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3294 (_PyUnicode_LENGTH(u) + 1));
3295 if (!_PyUnicode_WSTR(u)) {
3296 PyErr_NoMemory();
3297 return NULL;
3298 }
3299 if (!PyUnicode_IS_COMPACT_ASCII(u))
3300 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3301 w = _PyUnicode_WSTR(u);
3302 wchar_end = w + _PyUnicode_LENGTH(u);
3303
3304 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3305 one_byte = PyUnicode_1BYTE_DATA(u);
3306 for (; w < wchar_end; ++one_byte, ++w)
3307 *w = *one_byte;
3308 /* null-terminate the wstr */
3309 *w = 0;
3310 }
3311 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3312#if SIZEOF_WCHAR_T == 4
3313 two_bytes = PyUnicode_2BYTE_DATA(u);
3314 for (; w < wchar_end; ++two_bytes, ++w)
3315 *w = *two_bytes;
3316 /* null-terminate the wstr */
3317 *w = 0;
3318#else
3319 /* sizeof(wchar_t) == 2 */
3320 PyObject_FREE(_PyUnicode_WSTR(u));
3321 _PyUnicode_WSTR(u) = NULL;
3322 Py_FatalError("Impossible unicode object state, wstr "
3323 "and str should share memory already.");
3324 return NULL;
3325#endif
3326 }
3327 else {
3328 assert(0 && "This should never happen.");
3329 }
3330 }
3331 }
3332 if (size != NULL)
3333 *size = PyUnicode_WSTR_LENGTH(u);
3334 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003335}
3336
Alexander Belopolsky40018472011-02-26 01:02:56 +00003337Py_UNICODE *
3338PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003339{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003340 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003341}
3342
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003343
Alexander Belopolsky40018472011-02-26 01:02:56 +00003344Py_ssize_t
3345PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003346{
3347 if (!PyUnicode_Check(unicode)) {
3348 PyErr_BadArgument();
3349 goto onError;
3350 }
3351 return PyUnicode_GET_SIZE(unicode);
3352
Benjamin Peterson29060642009-01-31 22:14:21 +00003353 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003354 return -1;
3355}
3356
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003357Py_ssize_t
3358PyUnicode_GetLength(PyObject *unicode)
3359{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003360 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003361 PyErr_BadArgument();
3362 return -1;
3363 }
3364
3365 return PyUnicode_GET_LENGTH(unicode);
3366}
3367
3368Py_UCS4
3369PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3370{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003371 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3372 PyErr_BadArgument();
3373 return (Py_UCS4)-1;
3374 }
3375 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3376 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003377 return (Py_UCS4)-1;
3378 }
3379 return PyUnicode_READ_CHAR(unicode, index);
3380}
3381
3382int
3383PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3384{
3385 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003386 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003387 return -1;
3388 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003389 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3390 PyErr_SetString(PyExc_IndexError, "string index out of range");
3391 return -1;
3392 }
3393 if (_PyUnicode_Dirty(unicode))
3394 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003395 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3396 index, ch);
3397 return 0;
3398}
3399
/* Return the name of the default encoding, which is always "utf-8".
   The result is a string literal and must not be modified. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    return "utf-8";
}
3405
Victor Stinner554f3f02010-06-16 23:33:54 +00003406/* create or adjust a UnicodeDecodeError */
3407static void
3408make_decode_exception(PyObject **exceptionObject,
3409 const char *encoding,
3410 const char *input, Py_ssize_t length,
3411 Py_ssize_t startpos, Py_ssize_t endpos,
3412 const char *reason)
3413{
3414 if (*exceptionObject == NULL) {
3415 *exceptionObject = PyUnicodeDecodeError_Create(
3416 encoding, input, length, startpos, endpos, reason);
3417 }
3418 else {
3419 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3420 goto onError;
3421 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3422 goto onError;
3423 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3424 goto onError;
3425 }
3426 return;
3427
3428onError:
3429 Py_DECREF(*exceptionObject);
3430 *exceptionObject = NULL;
3431}
3432
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003433/* error handling callback helper:
3434 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003435 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003436 and adjust various state variables.
3437 return 0 on success, -1 on error
3438*/
3439
Alexander Belopolsky40018472011-02-26 01:02:56 +00003440static int
3441unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003442 const char *encoding, const char *reason,
3443 const char **input, const char **inend, Py_ssize_t *startinpos,
3444 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3445 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003446{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003447 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003448
3449 PyObject *restuple = NULL;
3450 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003451 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003452 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003453 Py_ssize_t requiredsize;
3454 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003455 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003456 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003457 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003458 int res = -1;
3459
3460 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003461 *errorHandler = PyCodec_LookupError(errors);
3462 if (*errorHandler == NULL)
3463 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003464 }
3465
Victor Stinner554f3f02010-06-16 23:33:54 +00003466 make_decode_exception(exceptionObject,
3467 encoding,
3468 *input, *inend - *input,
3469 *startinpos, *endinpos,
3470 reason);
3471 if (*exceptionObject == NULL)
3472 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003473
3474 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3475 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003476 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003477 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003478 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003479 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003480 }
3481 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003482 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003483
3484 /* Copy back the bytes variables, which might have been modified by the
3485 callback */
3486 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3487 if (!inputobj)
3488 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003489 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003490 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003491 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003492 *input = PyBytes_AS_STRING(inputobj);
3493 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003494 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003495 /* we can DECREF safely, as the exception has another reference,
3496 so the object won't go away. */
3497 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003498
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003499 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003500 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003501 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003502 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3503 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003504 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003505
3506 /* need more space? (at least enough for what we
3507 have+the replacement+the rest of the string (starting
3508 at the new input position), so we won't have to check space
3509 when there are no errors in the rest of the string) */
3510 repptr = PyUnicode_AS_UNICODE(repunicode);
3511 repsize = PyUnicode_GET_SIZE(repunicode);
3512 requiredsize = *outpos + repsize + insize-newpos;
3513 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003514 if (requiredsize<2*outsize)
3515 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003516 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003517 goto onError;
3518 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003519 }
3520 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003521 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003522 Py_UNICODE_COPY(*outptr, repptr, repsize);
3523 *outptr += repsize;
3524 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003525
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003526 /* we made it! */
3527 res = 0;
3528
Benjamin Peterson29060642009-01-31 22:14:21 +00003529 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003530 Py_XDECREF(restuple);
3531 return res;
3532}
3533
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64.  Each of them may evaluate its
   argument more than once, so pass side-effect-free expressions only. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

/* Read-only lookup table, indexed by ASCII code (0..127). */
static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The range test comes first so that utf7_category is only indexed with
 * values 1..127.  directO and directWS are parenthesized so that compound
 * flag expressions (e.g. "a || b") keep their meaning. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003614
Alexander Belopolsky40018472011-02-26 01:02:56 +00003615PyObject *
3616PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003617 Py_ssize_t size,
3618 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003619{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003620 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3621}
3622
Antoine Pitrou244651a2009-05-04 18:56:13 +00003623/* The decoder. The only state we preserve is our read position,
3624 * i.e. how many characters we have consumed. So if we end in the
3625 * middle of a shift sequence we have to back off the read position
3626 * and the output to the beginning of the sequence, otherwise we lose
3627 * all the shift state (seen bits, number of bits seen, high
3628 * surrogate). */
3629
Alexander Belopolsky40018472011-02-26 01:02:56 +00003630PyObject *
3631PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003632 Py_ssize_t size,
3633 const char *errors,
3634 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003635{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003636 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003637 Py_ssize_t startinpos;
3638 Py_ssize_t endinpos;
3639 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003640 const char *e;
3641 PyUnicodeObject *unicode;
3642 Py_UNICODE *p;
3643 const char *errmsg = "";
3644 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003645 Py_UNICODE *shiftOutStart;
3646 unsigned int base64bits = 0;
3647 unsigned long base64buffer = 0;
3648 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003649 PyObject *errorHandler = NULL;
3650 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003651
3652 unicode = _PyUnicode_New(size);
3653 if (!unicode)
3654 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003655 if (size == 0) {
3656 if (consumed)
3657 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003658 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003659 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003660
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003661 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003662 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003663 e = s + size;
3664
3665 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003666 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003667 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003668 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003669
Antoine Pitrou244651a2009-05-04 18:56:13 +00003670 if (inShift) { /* in a base-64 section */
3671 if (IS_BASE64(ch)) { /* consume a base-64 character */
3672 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3673 base64bits += 6;
3674 s++;
3675 if (base64bits >= 16) {
3676 /* we have enough bits for a UTF-16 value */
3677 Py_UNICODE outCh = (Py_UNICODE)
3678 (base64buffer >> (base64bits-16));
3679 base64bits -= 16;
3680 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3681 if (surrogate) {
3682 /* expecting a second surrogate */
3683 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3684#ifdef Py_UNICODE_WIDE
3685 *p++ = (((surrogate & 0x3FF)<<10)
3686 | (outCh & 0x3FF)) + 0x10000;
3687#else
3688 *p++ = surrogate;
3689 *p++ = outCh;
3690#endif
3691 surrogate = 0;
3692 }
3693 else {
3694 surrogate = 0;
3695 errmsg = "second surrogate missing";
3696 goto utf7Error;
3697 }
3698 }
3699 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3700 /* first surrogate */
3701 surrogate = outCh;
3702 }
3703 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3704 errmsg = "unexpected second surrogate";
3705 goto utf7Error;
3706 }
3707 else {
3708 *p++ = outCh;
3709 }
3710 }
3711 }
3712 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003713 inShift = 0;
3714 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003715 if (surrogate) {
3716 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003717 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003718 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003719 if (base64bits > 0) { /* left-over bits */
3720 if (base64bits >= 6) {
3721 /* We've seen at least one base-64 character */
3722 errmsg = "partial character in shift sequence";
3723 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003724 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003725 else {
3726 /* Some bits remain; they should be zero */
3727 if (base64buffer != 0) {
3728 errmsg = "non-zero padding bits in shift sequence";
3729 goto utf7Error;
3730 }
3731 }
3732 }
3733 if (ch != '-') {
3734 /* '-' is absorbed; other terminating
3735 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003736 *p++ = ch;
3737 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003738 }
3739 }
3740 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003741 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003742 s++; /* consume '+' */
3743 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003744 s++;
3745 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003746 }
3747 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003748 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003749 shiftOutStart = p;
3750 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003751 }
3752 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003753 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003754 *p++ = ch;
3755 s++;
3756 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003757 else {
3758 startinpos = s-starts;
3759 s++;
3760 errmsg = "unexpected special character";
3761 goto utf7Error;
3762 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003763 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003764utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003765 outpos = p-PyUnicode_AS_UNICODE(unicode);
3766 endinpos = s-starts;
3767 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003768 errors, &errorHandler,
3769 "utf7", errmsg,
3770 &starts, &e, &startinpos, &endinpos, &exc, &s,
3771 &unicode, &outpos, &p))
3772 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003773 }
3774
Antoine Pitrou244651a2009-05-04 18:56:13 +00003775 /* end of string */
3776
3777 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3778 /* if we're in an inconsistent state, that's an error */
3779 if (surrogate ||
3780 (base64bits >= 6) ||
3781 (base64bits > 0 && base64buffer != 0)) {
3782 outpos = p-PyUnicode_AS_UNICODE(unicode);
3783 endinpos = size;
3784 if (unicode_decode_call_errorhandler(
3785 errors, &errorHandler,
3786 "utf7", "unterminated shift sequence",
3787 &starts, &e, &startinpos, &endinpos, &exc, &s,
3788 &unicode, &outpos, &p))
3789 goto onError;
3790 if (s < e)
3791 goto restart;
3792 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003793 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003794
3795 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003796 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003797 if (inShift) {
3798 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003799 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003800 }
3801 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003802 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003803 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003804 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003805
Victor Stinnerfe226c02011-10-03 03:52:20 +02003806 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003807 goto onError;
3808
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003809 Py_XDECREF(errorHandler);
3810 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02003811#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02003812 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003813 Py_DECREF(unicode);
3814 return NULL;
3815 }
Victor Stinner17efeed2011-10-04 20:05:46 +02003816#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02003817 assert(_PyUnicode_CheckConsistency(unicode, 1));
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003818 return (PyObject *)unicode;
3819
Benjamin Peterson29060642009-01-31 22:14:21 +00003820 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003821 Py_XDECREF(errorHandler);
3822 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003823 Py_DECREF(unicode);
3824 return NULL;
3825}
3826
3827
Alexander Belopolsky40018472011-02-26 01:02:56 +00003828PyObject *
3829PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003830 Py_ssize_t size,
3831 int base64SetO,
3832 int base64WhiteSpace,
3833 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003834{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003835 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003836 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003837 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003838 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003839 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003840 unsigned int base64bits = 0;
3841 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003842 char * out;
3843 char * start;
3844
3845 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003846 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003847
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003848 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003849 return PyErr_NoMemory();
3850
Antoine Pitrou244651a2009-05-04 18:56:13 +00003851 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003852 if (v == NULL)
3853 return NULL;
3854
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003855 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003856 for (;i < size; ++i) {
3857 Py_UNICODE ch = s[i];
3858
Antoine Pitrou244651a2009-05-04 18:56:13 +00003859 if (inShift) {
3860 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3861 /* shifting out */
3862 if (base64bits) { /* output remaining bits */
3863 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3864 base64buffer = 0;
3865 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003866 }
3867 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003868 /* Characters not in the BASE64 set implicitly unshift the sequence
3869 so no '-' is required, except if the character is itself a '-' */
3870 if (IS_BASE64(ch) || ch == '-') {
3871 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003872 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003873 *out++ = (char) ch;
3874 }
3875 else {
3876 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003877 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003878 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003879 else { /* not in a shift sequence */
3880 if (ch == '+') {
3881 *out++ = '+';
3882 *out++ = '-';
3883 }
3884 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3885 *out++ = (char) ch;
3886 }
3887 else {
3888 *out++ = '+';
3889 inShift = 1;
3890 goto encode_char;
3891 }
3892 }
3893 continue;
3894encode_char:
3895#ifdef Py_UNICODE_WIDE
3896 if (ch >= 0x10000) {
3897 /* code first surrogate */
3898 base64bits += 16;
3899 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3900 while (base64bits >= 6) {
3901 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3902 base64bits -= 6;
3903 }
3904 /* prepare second surrogate */
3905 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3906 }
3907#endif
3908 base64bits += 16;
3909 base64buffer = (base64buffer << 16) | ch;
3910 while (base64bits >= 6) {
3911 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3912 base64bits -= 6;
3913 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003914 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003915 if (base64bits)
3916 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3917 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003918 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003919 if (_PyBytes_Resize(&v, out - start) < 0)
3920 return NULL;
3921 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003922}
3923
Antoine Pitrou244651a2009-05-04 18:56:13 +00003924#undef IS_BASE64
3925#undef FROM_BASE64
3926#undef TO_BASE64
3927#undef DECODE_DIRECT
3928#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003929
Guido van Rossumd57fd912000-03-10 22:53:23 +00003930/* --- UTF-8 Codec -------------------------------------------------------- */
3931
/* Read-only lookup table indexed by the first byte of a UTF-8
   sequence; declared const so it cannot be modified by accident. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
3953
Alexander Belopolsky40018472011-02-26 01:02:56 +00003954PyObject *
3955PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003956 Py_ssize_t size,
3957 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003958{
Walter Dörwald69652032004-09-07 20:24:22 +00003959 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3960}
3961
Antoine Pitrouab868312009-01-10 15:40:25 +00003962/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3963#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3964
3965/* Mask to quickly check whether a C 'long' contains a
3966 non-ASCII, UTF8-encoded char. */
3967#if (SIZEOF_LONG == 8)
3968# define ASCII_CHAR_MASK 0x8080808080808080L
3969#elif (SIZEOF_LONG == 4)
3970# define ASCII_CHAR_MASK 0x80808080L
3971#else
3972# error C 'long' size should be either 4 or 8!
3973#endif
3974
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003975/* Scans a UTF-8 string and returns the maximum character to be expected,
3976 the size of the decoded unicode string and if any major errors were
3977 encountered.
3978
3979 This function does check basic UTF-8 sanity, it does however NOT CHECK
3980 if the string contains surrogates, and if all continuation bytes are
3981 within the correct ranges, these checks are performed in
3982 PyUnicode_DecodeUTF8Stateful.
3983
3984 If it sets has_errors to 1, it means the value of unicode_size and max_char
3985 will be bogus and you should not rely on useful information in them.
3986 */
3987static Py_UCS4
3988utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3989 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3990 int *has_errors)
3991{
3992 Py_ssize_t n;
3993 Py_ssize_t char_count = 0;
3994 Py_UCS4 max_char = 127, new_max;
3995 Py_UCS4 upper_bound;
3996 const unsigned char *p = (const unsigned char *)s;
3997 const unsigned char *end = p + string_size;
3998 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3999 int err = 0;
4000
4001 for (; p < end && !err; ++p, ++char_count) {
4002 /* Only check value if it's not a ASCII char... */
4003 if (*p < 0x80) {
4004 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4005 an explanation. */
4006 if (!((size_t) p & LONG_PTR_MASK)) {
4007 /* Help register allocation */
4008 register const unsigned char *_p = p;
4009 while (_p < aligned_end) {
4010 unsigned long value = *(unsigned long *) _p;
4011 if (value & ASCII_CHAR_MASK)
4012 break;
4013 _p += SIZEOF_LONG;
4014 char_count += SIZEOF_LONG;
4015 }
4016 p = _p;
4017 if (p == end)
4018 break;
4019 }
4020 }
4021 if (*p >= 0x80) {
4022 n = utf8_code_length[*p];
4023 new_max = max_char;
4024 switch (n) {
4025 /* invalid start byte */
4026 case 0:
4027 err = 1;
4028 break;
4029 case 2:
4030 /* Code points between 0x00FF and 0x07FF inclusive.
4031 Approximate the upper bound of the code point,
4032 if this flips over 255 we can be sure it will be more
4033 than 255 and the string will need 2 bytes per code coint,
4034 if it stays under or equal to 255, we can be sure 1 byte
4035 is enough.
4036 ((*p & 0b00011111) << 6) | 0b00111111 */
4037 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
4038 if (max_char < upper_bound)
4039 new_max = upper_bound;
4040 /* Ensure we track at least that we left ASCII space. */
4041 if (new_max < 128)
4042 new_max = 128;
4043 break;
4044 case 3:
4045 /* Between 0x0FFF and 0xFFFF inclusive, so values are
4046 always > 255 and <= 65535 and will always need 2 bytes. */
4047 if (max_char < 65535)
4048 new_max = 65535;
4049 break;
4050 case 4:
4051 /* Code point will be above 0xFFFF for sure in this case. */
4052 new_max = 65537;
4053 break;
4054 /* Internal error, this should be caught by the first if */
4055 case 1:
4056 default:
4057 assert(0 && "Impossible case in utf8_max_char_and_size");
4058 err = 1;
4059 }
4060 /* Instead of number of overall bytes for this code point,
Georg Brandl7597add2011-10-05 16:36:47 +02004061 n contains the number of following bytes: */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004062 --n;
4063 /* Check if the follow up chars are all valid continuation bytes */
4064 if (n >= 1) {
4065 const unsigned char *cont;
4066 if ((p + n) >= end) {
4067 if (consumed == 0)
4068 /* incomplete data, non-incremental decoding */
4069 err = 1;
4070 break;
4071 }
4072 for (cont = p + 1; cont < (p + n); ++cont) {
4073 if ((*cont & 0xc0) != 0x80) {
4074 err = 1;
4075 break;
4076 }
4077 }
4078 p += n;
4079 }
4080 else
4081 err = 1;
4082 max_char = new_max;
4083 }
4084 }
4085
4086 if (unicode_size)
4087 *unicode_size = char_count;
4088 if (has_errors)
4089 *has_errors = err;
4090 return max_char;
4091}
4092
/* Store 'value' at position 'index' of the character buffer 'buf',
   converting it to the element type selected by 'kind'.  Like
   PyUnicode_WRITE, but additionally handles PyUnicode_WCHAR_KIND,
   i.e. the wstr (Py_UNICODE) buffer of the legacy representation.
   Any kind other than WCHAR, 1BYTE or 2BYTE is stored as Py_UCS4.
   Exactly one store is executed, so 'index' and 'value' are each
   evaluated once. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                  \
    do {                                                                 \
        switch ((int)(kind)) {                                           \
        case PyUnicode_WCHAR_KIND:                                       \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);        \
            break;                                                       \
        case PyUnicode_1BYTE_KIND:                                       \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value);  \
            break;                                                       \
        case PyUnicode_2BYTE_KIND:                                       \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);              \
            break;                                                       \
        default:                                                         \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);              \
            break;                                                       \
        }                                                                \
    } while (0)
4107
Alexander Belopolsky40018472011-02-26 01:02:56 +00004108PyObject *
4109PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004110 Py_ssize_t size,
4111 const char *errors,
4112 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004113{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004114 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004115 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004116 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004117 Py_ssize_t startinpos;
4118 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00004119 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004120 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004121 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004122 PyObject *errorHandler = NULL;
4123 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004124 Py_UCS4 maxchar = 0;
4125 Py_ssize_t unicode_size;
4126 Py_ssize_t i;
4127 int kind;
4128 void *data;
4129 int has_errors;
4130 Py_UNICODE *error_outptr;
4131#if SIZEOF_WCHAR_T == 2
4132 Py_ssize_t wchar_offset = 0;
4133#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004134
Walter Dörwald69652032004-09-07 20:24:22 +00004135 if (size == 0) {
4136 if (consumed)
4137 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004138 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00004139 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004140 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
4141 consumed, &has_errors);
4142 if (has_errors) {
4143 unicode = _PyUnicode_New(size);
4144 if (!unicode)
4145 return NULL;
4146 kind = PyUnicode_WCHAR_KIND;
4147 data = PyUnicode_AS_UNICODE(unicode);
4148 assert(data != NULL);
4149 }
4150 else {
4151 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
4152 if (!unicode)
4153 return NULL;
4154 /* When the string is ASCII only, just use memcpy and return.
4155 unicode_size may be != size if there is an incomplete UTF-8
4156 sequence at the end of the ASCII block. */
4157 if (maxchar < 128 && size == unicode_size) {
4158 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4159 return (PyObject *)unicode;
4160 }
4161 kind = PyUnicode_KIND(unicode);
4162 data = PyUnicode_DATA(unicode);
4163 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004164 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004165 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004166 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00004167 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004168
4169 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004170 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004171
4172 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004173 /* Fast path for runs of ASCII characters. Given that common UTF-8
4174 input will consist of an overwhelming majority of ASCII
4175 characters, we try to optimize for this case by checking
4176 as many characters as a C 'long' can contain.
4177 First, check if we can do an aligned read, as most CPUs have
4178 a penalty for unaligned reads.
4179 */
4180 if (!((size_t) s & LONG_PTR_MASK)) {
4181 /* Help register allocation */
4182 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004183 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004184 while (_s < aligned_end) {
4185 /* Read a whole long at a time (either 4 or 8 bytes),
4186 and do a fast unrolled copy if it only contains ASCII
4187 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004188 unsigned long value = *(unsigned long *) _s;
4189 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004190 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004191 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4192 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4193 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4194 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004195#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004196 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4197 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4198 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4199 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004200#endif
4201 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004202 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004203 }
4204 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004205 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004206 if (s == e)
4207 break;
4208 ch = (unsigned char)*s;
4209 }
4210 }
4211
4212 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004213 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004214 s++;
4215 continue;
4216 }
4217
4218 n = utf8_code_length[ch];
4219
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004220 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004221 if (consumed)
4222 break;
4223 else {
4224 errmsg = "unexpected end of data";
4225 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004226 endinpos = startinpos+1;
4227 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4228 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004229 goto utf8Error;
4230 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004231 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004232
4233 switch (n) {
4234
4235 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004236 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004237 startinpos = s-starts;
4238 endinpos = startinpos+1;
4239 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004240
4241 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004242 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004243 startinpos = s-starts;
4244 endinpos = startinpos+1;
4245 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004246
4247 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004248 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004249 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004250 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004251 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004252 goto utf8Error;
4253 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004254 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004255 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004256 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004257 break;
4258
4259 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004260 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4261 will result in surrogates in range d800-dfff. Surrogates are
4262 not valid UTF-8 so they are rejected.
4263 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4264 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004265 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004266 (s[2] & 0xc0) != 0x80 ||
4267 ((unsigned char)s[0] == 0xE0 &&
4268 (unsigned char)s[1] < 0xA0) ||
4269 ((unsigned char)s[0] == 0xED &&
4270 (unsigned char)s[1] > 0x9F)) {
4271 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004272 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004273 endinpos = startinpos + 1;
4274
4275 /* if s[1] first two bits are 1 and 0, then the invalid
4276 continuation byte is s[2], so increment endinpos by 1,
4277 if not, s[1] is invalid and endinpos doesn't need to
4278 be incremented. */
4279 if ((s[1] & 0xC0) == 0x80)
4280 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004281 goto utf8Error;
4282 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004283 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004284 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004285 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004286 break;
4287
4288 case 4:
4289 if ((s[1] & 0xc0) != 0x80 ||
4290 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004291 (s[3] & 0xc0) != 0x80 ||
4292 ((unsigned char)s[0] == 0xF0 &&
4293 (unsigned char)s[1] < 0x90) ||
4294 ((unsigned char)s[0] == 0xF4 &&
4295 (unsigned char)s[1] > 0x8F)) {
4296 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004297 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004298 endinpos = startinpos + 1;
4299 if ((s[1] & 0xC0) == 0x80) {
4300 endinpos++;
4301 if ((s[2] & 0xC0) == 0x80)
4302 endinpos++;
4303 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004304 goto utf8Error;
4305 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004306 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004307 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4308 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4309
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004310 /* If the string is flexible or we have native UCS-4, write
4311 directly.. */
4312 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4313 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004314
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004315 else {
4316 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004317
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004318 /* translate from 10000..10FFFF to 0..FFFF */
4319 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004320
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004321 /* high surrogate = top 10 bits added to D800 */
4322 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4323 (Py_UNICODE)(0xD800 + (ch >> 10)));
4324
4325 /* low surrogate = bottom 10 bits added to DC00 */
4326 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4327 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4328 }
4329#if SIZEOF_WCHAR_T == 2
4330 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004331#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004332 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004333 }
4334 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004335 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004336
Benjamin Peterson29060642009-01-31 22:14:21 +00004337 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004338 /* If this is not yet a resizable string, make it one.. */
4339 if (kind != PyUnicode_WCHAR_KIND) {
4340 const Py_UNICODE *u;
4341 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4342 if (!new_unicode)
4343 goto onError;
4344 u = PyUnicode_AsUnicode((PyObject *)unicode);
4345 if (!u)
4346 goto onError;
4347#if SIZEOF_WCHAR_T == 2
4348 i += wchar_offset;
4349#endif
4350 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4351 Py_DECREF(unicode);
4352 unicode = new_unicode;
4353 kind = 0;
4354 data = PyUnicode_AS_UNICODE(new_unicode);
4355 assert(data != NULL);
4356 }
4357 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004358 if (unicode_decode_call_errorhandler(
4359 errors, &errorHandler,
4360 "utf8", errmsg,
4361 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004362 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004363 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004364 /* Update data because unicode_decode_call_errorhandler might have
4365 re-created or resized the unicode object. */
4366 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004367 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004368 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004369 /* Ensure the unicode_size calculation above was correct: */
4370 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4371
Walter Dörwald69652032004-09-07 20:24:22 +00004372 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004373 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004374
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004375 /* Adjust length and ready string when it contained errors and
4376 is of the old resizable kind. */
4377 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004378 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004379 goto onError;
4380 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004381
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004382 Py_XDECREF(errorHandler);
4383 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004384#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004385 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004386 Py_DECREF(unicode);
4387 return NULL;
4388 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004389#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004390 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004391 return (PyObject *)unicode;
4392
Benjamin Peterson29060642009-01-31 22:14:21 +00004393 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004394 Py_XDECREF(errorHandler);
4395 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004396 Py_DECREF(unicode);
4397 return NULL;
4398}
4399
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004400#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004401
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004402#ifdef __APPLE__
4403
4404/* Simplified UTF-8 decoder using surrogateescape error handler,
4405 used to decode the command line arguments on Mac OS X. */
4406
4407wchar_t*
4408_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4409{
4410 int n;
4411 const char *e;
4412 wchar_t *unicode, *p;
4413
4414 /* Note: size will always be longer than the resulting Unicode
4415 character count */
4416 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4417 PyErr_NoMemory();
4418 return NULL;
4419 }
4420 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4421 if (!unicode)
4422 return NULL;
4423
4424 /* Unpack UTF-8 encoded data */
4425 p = unicode;
4426 e = s + size;
4427 while (s < e) {
4428 Py_UCS4 ch = (unsigned char)*s;
4429
4430 if (ch < 0x80) {
4431 *p++ = (wchar_t)ch;
4432 s++;
4433 continue;
4434 }
4435
4436 n = utf8_code_length[ch];
4437 if (s + n > e) {
4438 goto surrogateescape;
4439 }
4440
4441 switch (n) {
4442 case 0:
4443 case 1:
4444 goto surrogateescape;
4445
4446 case 2:
4447 if ((s[1] & 0xc0) != 0x80)
4448 goto surrogateescape;
4449 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4450 assert ((ch > 0x007F) && (ch <= 0x07FF));
4451 *p++ = (wchar_t)ch;
4452 break;
4453
4454 case 3:
4455 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4456 will result in surrogates in range d800-dfff. Surrogates are
4457 not valid UTF-8 so they are rejected.
4458 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4459 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4460 if ((s[1] & 0xc0) != 0x80 ||
4461 (s[2] & 0xc0) != 0x80 ||
4462 ((unsigned char)s[0] == 0xE0 &&
4463 (unsigned char)s[1] < 0xA0) ||
4464 ((unsigned char)s[0] == 0xED &&
4465 (unsigned char)s[1] > 0x9F)) {
4466
4467 goto surrogateescape;
4468 }
4469 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4470 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004471 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004472 break;
4473
4474 case 4:
4475 if ((s[1] & 0xc0) != 0x80 ||
4476 (s[2] & 0xc0) != 0x80 ||
4477 (s[3] & 0xc0) != 0x80 ||
4478 ((unsigned char)s[0] == 0xF0 &&
4479 (unsigned char)s[1] < 0x90) ||
4480 ((unsigned char)s[0] == 0xF4 &&
4481 (unsigned char)s[1] > 0x8F)) {
4482 goto surrogateescape;
4483 }
4484 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4485 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4486 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4487
4488#if SIZEOF_WCHAR_T == 4
4489 *p++ = (wchar_t)ch;
4490#else
4491 /* compute and append the two surrogates: */
4492
4493 /* translate from 10000..10FFFF to 0..FFFF */
4494 ch -= 0x10000;
4495
4496 /* high surrogate = top 10 bits added to D800 */
4497 *p++ = (wchar_t)(0xD800 + (ch >> 10));
4498
4499 /* low surrogate = bottom 10 bits added to DC00 */
4500 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4501#endif
4502 break;
4503 }
4504 s += n;
4505 continue;
4506
4507 surrogateescape:
4508 *p++ = 0xDC00 + ch;
4509 s++;
4510 }
4511 *p = L'\0';
4512 return unicode;
4513}
4514
4515#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004516
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004517/* Primary internal function which creates utf8 encoded bytes objects.
4518
4519 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004520 and allocate exactly as much space needed at the end. Else allocate the
4521 maximum possible needed (4 result bytes per Unicode character), and return
4522 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004523*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004524PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004525_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004526{
Tim Peters602f7402002-04-27 18:03:26 +00004527#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004528
Guido van Rossum98297ee2007-11-06 21:34:58 +00004529 Py_ssize_t i; /* index into s of next input byte */
4530 PyObject *result; /* result string object */
4531 char *p; /* next free byte in output buffer */
4532 Py_ssize_t nallocated; /* number of result bytes allocated */
4533 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004534 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004535 PyObject *errorHandler = NULL;
4536 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004537 int kind;
4538 void *data;
4539 Py_ssize_t size;
4540 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4541#if SIZEOF_WCHAR_T == 2
4542 Py_ssize_t wchar_offset = 0;
4543#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004544
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004545 if (!PyUnicode_Check(unicode)) {
4546 PyErr_BadArgument();
4547 return NULL;
4548 }
4549
4550 if (PyUnicode_READY(unicode) == -1)
4551 return NULL;
4552
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004553 if (PyUnicode_UTF8(unicode))
4554 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4555 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004556
4557 kind = PyUnicode_KIND(unicode);
4558 data = PyUnicode_DATA(unicode);
4559 size = PyUnicode_GET_LENGTH(unicode);
4560
Tim Peters602f7402002-04-27 18:03:26 +00004561 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004562
Tim Peters602f7402002-04-27 18:03:26 +00004563 if (size <= MAX_SHORT_UNICHARS) {
4564 /* Write into the stack buffer; nallocated can't overflow.
4565 * At the end, we'll allocate exactly as much heap space as it
4566 * turns out we need.
4567 */
4568 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004569 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004570 p = stackbuf;
4571 }
4572 else {
4573 /* Overallocate on the heap, and give the excess back at the end. */
4574 nallocated = size * 4;
4575 if (nallocated / 4 != size) /* overflow! */
4576 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004577 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004578 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004579 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004580 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004581 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004582
Tim Peters602f7402002-04-27 18:03:26 +00004583 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004584 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004585
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004586 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004587 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004588 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004589
Guido van Rossumd57fd912000-03-10 22:53:23 +00004590 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004591 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004592 *p++ = (char)(0xc0 | (ch >> 6));
4593 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004594 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004595 Py_ssize_t newpos;
4596 PyObject *rep;
4597 Py_ssize_t repsize, k, startpos;
4598 startpos = i-1;
4599#if SIZEOF_WCHAR_T == 2
4600 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004601#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004602 rep = unicode_encode_call_errorhandler(
4603 errors, &errorHandler, "utf-8", "surrogates not allowed",
4604 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4605 &exc, startpos, startpos+1, &newpos);
4606 if (!rep)
4607 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004608
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004609 if (PyBytes_Check(rep))
4610 repsize = PyBytes_GET_SIZE(rep);
4611 else
4612 repsize = PyUnicode_GET_SIZE(rep);
4613
4614 if (repsize > 4) {
4615 Py_ssize_t offset;
4616
4617 if (result == NULL)
4618 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004619 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004620 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004621
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004622 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4623 /* integer overflow */
4624 PyErr_NoMemory();
4625 goto error;
4626 }
4627 nallocated += repsize - 4;
4628 if (result != NULL) {
4629 if (_PyBytes_Resize(&result, nallocated) < 0)
4630 goto error;
4631 } else {
4632 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004633 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004634 goto error;
4635 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4636 }
4637 p = PyBytes_AS_STRING(result) + offset;
4638 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004639
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004640 if (PyBytes_Check(rep)) {
4641 char *prep = PyBytes_AS_STRING(rep);
4642 for(k = repsize; k > 0; k--)
4643 *p++ = *prep++;
4644 } else /* rep is unicode */ {
4645 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4646 Py_UNICODE c;
4647
4648 for(k=0; k<repsize; k++) {
4649 c = prep[k];
4650 if (0x80 <= c) {
4651 raise_encode_exception(&exc, "utf-8",
4652 PyUnicode_AS_UNICODE(unicode),
4653 size, i-1, i,
4654 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004655 goto error;
4656 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004657 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004658 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004659 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004660 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004661 } else if (ch < 0x10000) {
4662 *p++ = (char)(0xe0 | (ch >> 12));
4663 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4664 *p++ = (char)(0x80 | (ch & 0x3f));
4665 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004666 /* Encode UCS4 Unicode ordinals */
4667 *p++ = (char)(0xf0 | (ch >> 18));
4668 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4669 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4670 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004671#if SIZEOF_WCHAR_T == 2
4672 wchar_offset++;
4673#endif
Tim Peters602f7402002-04-27 18:03:26 +00004674 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004675 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004676
Guido van Rossum98297ee2007-11-06 21:34:58 +00004677 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004678 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004679 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004680 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004681 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004682 }
4683 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004684 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004685 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004686 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004687 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004688 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004689
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004690 Py_XDECREF(errorHandler);
4691 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004692 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004693 error:
4694 Py_XDECREF(errorHandler);
4695 Py_XDECREF(exc);
4696 Py_XDECREF(result);
4697 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004698
Tim Peters602f7402002-04-27 18:03:26 +00004699#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004700}
4701
Alexander Belopolsky40018472011-02-26 01:02:56 +00004702PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004703PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4704 Py_ssize_t size,
4705 const char *errors)
4706{
4707 PyObject *v, *unicode;
4708
4709 unicode = PyUnicode_FromUnicode(s, size);
4710 if (unicode == NULL)
4711 return NULL;
4712 v = _PyUnicode_AsUTF8String(unicode, errors);
4713 Py_DECREF(unicode);
4714 return v;
4715}
4716
4717PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004718PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004719{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004720 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004721}
4722
Walter Dörwald41980ca2007-08-16 21:55:45 +00004723/* --- UTF-32 Codec ------------------------------------------------------- */
4724
4725PyObject *
4726PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004727 Py_ssize_t size,
4728 const char *errors,
4729 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004730{
4731 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4732}
4733
4734PyObject *
4735PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004736 Py_ssize_t size,
4737 const char *errors,
4738 int *byteorder,
4739 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004740{
4741 const char *starts = s;
4742 Py_ssize_t startinpos;
4743 Py_ssize_t endinpos;
4744 Py_ssize_t outpos;
4745 PyUnicodeObject *unicode;
4746 Py_UNICODE *p;
4747#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004748 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004749 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004750#else
4751 const int pairs = 0;
4752#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004753 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004754 int bo = 0; /* assume native ordering by default */
4755 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004756 /* Offsets from q for retrieving bytes in the right order. */
4757#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4758 int iorder[] = {0, 1, 2, 3};
4759#else
4760 int iorder[] = {3, 2, 1, 0};
4761#endif
4762 PyObject *errorHandler = NULL;
4763 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004764
Walter Dörwald41980ca2007-08-16 21:55:45 +00004765 q = (unsigned char *)s;
4766 e = q + size;
4767
4768 if (byteorder)
4769 bo = *byteorder;
4770
4771 /* Check for BOM marks (U+FEFF) in the input and adjust current
4772 byte order setting accordingly. In native mode, the leading BOM
4773 mark is skipped, in all other modes, it is copied to the output
4774 stream as-is (giving a ZWNBSP character). */
4775 if (bo == 0) {
4776 if (size >= 4) {
4777 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004778 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004779#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004780 if (bom == 0x0000FEFF) {
4781 q += 4;
4782 bo = -1;
4783 }
4784 else if (bom == 0xFFFE0000) {
4785 q += 4;
4786 bo = 1;
4787 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004788#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004789 if (bom == 0x0000FEFF) {
4790 q += 4;
4791 bo = 1;
4792 }
4793 else if (bom == 0xFFFE0000) {
4794 q += 4;
4795 bo = -1;
4796 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004797#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004798 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004799 }
4800
4801 if (bo == -1) {
4802 /* force LE */
4803 iorder[0] = 0;
4804 iorder[1] = 1;
4805 iorder[2] = 2;
4806 iorder[3] = 3;
4807 }
4808 else if (bo == 1) {
4809 /* force BE */
4810 iorder[0] = 3;
4811 iorder[1] = 2;
4812 iorder[2] = 1;
4813 iorder[3] = 0;
4814 }
4815
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004816 /* On narrow builds we split characters outside the BMP into two
4817 codepoints => count how much extra space we need. */
4818#ifndef Py_UNICODE_WIDE
4819 for (qq = q; qq < e; qq += 4)
4820 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4821 pairs++;
4822#endif
4823
4824 /* This might be one to much, because of a BOM */
4825 unicode = _PyUnicode_New((size+3)/4+pairs);
4826 if (!unicode)
4827 return NULL;
4828 if (size == 0)
4829 return (PyObject *)unicode;
4830
4831 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004832 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004833
Walter Dörwald41980ca2007-08-16 21:55:45 +00004834 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004835 Py_UCS4 ch;
4836 /* remaining bytes at the end? (size should be divisible by 4) */
4837 if (e-q<4) {
4838 if (consumed)
4839 break;
4840 errmsg = "truncated data";
4841 startinpos = ((const char *)q)-starts;
4842 endinpos = ((const char *)e)-starts;
4843 goto utf32Error;
4844 /* The remaining input chars are ignored if the callback
4845 chooses to skip the input */
4846 }
4847 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4848 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004849
Benjamin Peterson29060642009-01-31 22:14:21 +00004850 if (ch >= 0x110000)
4851 {
4852 errmsg = "codepoint not in range(0x110000)";
4853 startinpos = ((const char *)q)-starts;
4854 endinpos = startinpos+4;
4855 goto utf32Error;
4856 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004857#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004858 if (ch >= 0x10000)
4859 {
4860 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4861 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4862 }
4863 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004864#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004865 *p++ = ch;
4866 q += 4;
4867 continue;
4868 utf32Error:
4869 outpos = p-PyUnicode_AS_UNICODE(unicode);
4870 if (unicode_decode_call_errorhandler(
4871 errors, &errorHandler,
4872 "utf32", errmsg,
4873 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4874 &unicode, &outpos, &p))
4875 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004876 }
4877
4878 if (byteorder)
4879 *byteorder = bo;
4880
4881 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004882 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004883
4884 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02004885 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004886 goto onError;
4887
4888 Py_XDECREF(errorHandler);
4889 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004890#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004891 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004892 Py_DECREF(unicode);
4893 return NULL;
4894 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004895#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004896 assert(_PyUnicode_CheckConsistency(unicode, 1));
Walter Dörwald41980ca2007-08-16 21:55:45 +00004897 return (PyObject *)unicode;
4898
Benjamin Peterson29060642009-01-31 22:14:21 +00004899 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004900 Py_DECREF(unicode);
4901 Py_XDECREF(errorHandler);
4902 Py_XDECREF(exc);
4903 return NULL;
4904}
4905
4906PyObject *
4907PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004908 Py_ssize_t size,
4909 const char *errors,
4910 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004911{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004912 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004913 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004914 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004915#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004916 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004917#else
4918 const int pairs = 0;
4919#endif
4920 /* Offsets from p for storing byte pairs in the right order. */
4921#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4922 int iorder[] = {0, 1, 2, 3};
4923#else
4924 int iorder[] = {3, 2, 1, 0};
4925#endif
4926
Benjamin Peterson29060642009-01-31 22:14:21 +00004927#define STORECHAR(CH) \
4928 do { \
4929 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4930 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4931 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4932 p[iorder[0]] = (CH) & 0xff; \
4933 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004934 } while(0)
4935
4936 /* In narrow builds we can output surrogate pairs as one codepoint,
4937 so we need less space. */
4938#ifndef Py_UNICODE_WIDE
4939 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004940 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4941 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4942 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004943#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004944 nsize = (size - pairs + (byteorder == 0));
4945 bytesize = nsize * 4;
4946 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004947 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004948 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004949 if (v == NULL)
4950 return NULL;
4951
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004952 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004953 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004954 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004955 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004956 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004957
4958 if (byteorder == -1) {
4959 /* force LE */
4960 iorder[0] = 0;
4961 iorder[1] = 1;
4962 iorder[2] = 2;
4963 iorder[3] = 3;
4964 }
4965 else if (byteorder == 1) {
4966 /* force BE */
4967 iorder[0] = 3;
4968 iorder[1] = 2;
4969 iorder[2] = 1;
4970 iorder[3] = 0;
4971 }
4972
4973 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004974 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004975#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004976 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4977 Py_UCS4 ch2 = *s;
4978 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4979 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4980 s++;
4981 size--;
4982 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004983 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004984#endif
4985 STORECHAR(ch);
4986 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004987
4988 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004989 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004990#undef STORECHAR
4991}
4992
Alexander Belopolsky40018472011-02-26 01:02:56 +00004993PyObject *
4994PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004995{
4996 if (!PyUnicode_Check(unicode)) {
4997 PyErr_BadArgument();
4998 return NULL;
4999 }
5000 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005001 PyUnicode_GET_SIZE(unicode),
5002 NULL,
5003 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005004}
5005
Guido van Rossumd57fd912000-03-10 22:53:23 +00005006/* --- UTF-16 Codec ------------------------------------------------------- */
5007
Tim Peters772747b2001-08-09 22:21:55 +00005008PyObject *
5009PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005010 Py_ssize_t size,
5011 const char *errors,
5012 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005013{
Walter Dörwald69652032004-09-07 20:24:22 +00005014 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5015}
5016
Antoine Pitrouab868312009-01-10 15:40:25 +00005017/* Two masks for fast checking of whether a C 'long' may contain
5018 UTF16-encoded surrogate characters. This is an efficient heuristic,
5019 assuming that non-surrogate characters with a code point >= 0x8000 are
5020 rare in most input.
5021 FAST_CHAR_MASK is used when the input is in native byte ordering,
5022 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005023*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005024#if (SIZEOF_LONG == 8)
5025# define FAST_CHAR_MASK 0x8000800080008000L
5026# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5027#elif (SIZEOF_LONG == 4)
5028# define FAST_CHAR_MASK 0x80008000L
5029# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5030#else
5031# error C 'long' size should be either 4 or 8!
5032#endif
5033
Walter Dörwald69652032004-09-07 20:24:22 +00005034PyObject *
5035PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005036 Py_ssize_t size,
5037 const char *errors,
5038 int *byteorder,
5039 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005040{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005041 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005042 Py_ssize_t startinpos;
5043 Py_ssize_t endinpos;
5044 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005045 PyUnicodeObject *unicode;
5046 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00005047 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005048 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005049 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005050 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005051 /* Offsets from q for retrieving byte pairs in the right order. */
5052#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5053 int ihi = 1, ilo = 0;
5054#else
5055 int ihi = 0, ilo = 1;
5056#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005057 PyObject *errorHandler = NULL;
5058 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005059
5060 /* Note: size will always be longer than the resulting Unicode
5061 character count */
5062 unicode = _PyUnicode_New(size);
5063 if (!unicode)
5064 return NULL;
5065 if (size == 0)
5066 return (PyObject *)unicode;
5067
5068 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005069 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00005070 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005071 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005072
5073 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005074 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005075
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005076 /* Check for BOM marks (U+FEFF) in the input and adjust current
5077 byte order setting accordingly. In native mode, the leading BOM
5078 mark is skipped, in all other modes, it is copied to the output
5079 stream as-is (giving a ZWNBSP character). */
5080 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005081 if (size >= 2) {
5082 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005083#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005084 if (bom == 0xFEFF) {
5085 q += 2;
5086 bo = -1;
5087 }
5088 else if (bom == 0xFFFE) {
5089 q += 2;
5090 bo = 1;
5091 }
Tim Petersced69f82003-09-16 20:30:58 +00005092#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005093 if (bom == 0xFEFF) {
5094 q += 2;
5095 bo = 1;
5096 }
5097 else if (bom == 0xFFFE) {
5098 q += 2;
5099 bo = -1;
5100 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005101#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005102 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005103 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005104
Tim Peters772747b2001-08-09 22:21:55 +00005105 if (bo == -1) {
5106 /* force LE */
5107 ihi = 1;
5108 ilo = 0;
5109 }
5110 else if (bo == 1) {
5111 /* force BE */
5112 ihi = 0;
5113 ilo = 1;
5114 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005115#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5116 native_ordering = ilo < ihi;
5117#else
5118 native_ordering = ilo > ihi;
5119#endif
Tim Peters772747b2001-08-09 22:21:55 +00005120
Antoine Pitrouab868312009-01-10 15:40:25 +00005121 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005122 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005123 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005124 /* First check for possible aligned read of a C 'long'. Unaligned
5125 reads are more expensive, better to defer to another iteration. */
5126 if (!((size_t) q & LONG_PTR_MASK)) {
5127 /* Fast path for runs of non-surrogate chars. */
5128 register const unsigned char *_q = q;
5129 Py_UNICODE *_p = p;
5130 if (native_ordering) {
5131 /* Native ordering is simple: as long as the input cannot
5132 possibly contain a surrogate char, do an unrolled copy
5133 of several 16-bit code points to the target object.
5134 The non-surrogate check is done on several input bytes
5135 at a time (as many as a C 'long' can contain). */
5136 while (_q < aligned_end) {
5137 unsigned long data = * (unsigned long *) _q;
5138 if (data & FAST_CHAR_MASK)
5139 break;
5140 _p[0] = ((unsigned short *) _q)[0];
5141 _p[1] = ((unsigned short *) _q)[1];
5142#if (SIZEOF_LONG == 8)
5143 _p[2] = ((unsigned short *) _q)[2];
5144 _p[3] = ((unsigned short *) _q)[3];
5145#endif
5146 _q += SIZEOF_LONG;
5147 _p += SIZEOF_LONG / 2;
5148 }
5149 }
5150 else {
5151 /* Byteswapped ordering is similar, but we must decompose
5152 the copy bytewise, and take care of zero'ing out the
5153 upper bytes if the target object is in 32-bit units
5154 (that is, in UCS-4 builds). */
5155 while (_q < aligned_end) {
5156 unsigned long data = * (unsigned long *) _q;
5157 if (data & SWAPPED_FAST_CHAR_MASK)
5158 break;
5159 /* Zero upper bytes in UCS-4 builds */
5160#if (Py_UNICODE_SIZE > 2)
5161 _p[0] = 0;
5162 _p[1] = 0;
5163#if (SIZEOF_LONG == 8)
5164 _p[2] = 0;
5165 _p[3] = 0;
5166#endif
5167#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005168 /* Issue #4916; UCS-4 builds on big endian machines must
5169 fill the two last bytes of each 4-byte unit. */
5170#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
5171# define OFF 2
5172#else
5173# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00005174#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005175 ((unsigned char *) _p)[OFF + 1] = _q[0];
5176 ((unsigned char *) _p)[OFF + 0] = _q[1];
5177 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
5178 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
5179#if (SIZEOF_LONG == 8)
5180 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
5181 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
5182 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
5183 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
5184#endif
5185#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00005186 _q += SIZEOF_LONG;
5187 _p += SIZEOF_LONG / 2;
5188 }
5189 }
5190 p = _p;
5191 q = _q;
5192 if (q >= e)
5193 break;
5194 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005195 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005196
Benjamin Peterson14339b62009-01-31 16:36:08 +00005197 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005198
5199 if (ch < 0xD800 || ch > 0xDFFF) {
5200 *p++ = ch;
5201 continue;
5202 }
5203
5204 /* UTF-16 code pair: */
5205 if (q > e) {
5206 errmsg = "unexpected end of data";
5207 startinpos = (((const char *)q) - 2) - starts;
5208 endinpos = ((const char *)e) + 1 - starts;
5209 goto utf16Error;
5210 }
5211 if (0xD800 <= ch && ch <= 0xDBFF) {
5212 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5213 q += 2;
5214 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00005215#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005216 *p++ = ch;
5217 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005218#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005219 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005220#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005221 continue;
5222 }
5223 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005224 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005225 startinpos = (((const char *)q)-4)-starts;
5226 endinpos = startinpos+2;
5227 goto utf16Error;
5228 }
5229
Benjamin Peterson14339b62009-01-31 16:36:08 +00005230 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005231 errmsg = "illegal encoding";
5232 startinpos = (((const char *)q)-2)-starts;
5233 endinpos = startinpos+2;
5234 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005235
Benjamin Peterson29060642009-01-31 22:14:21 +00005236 utf16Error:
5237 outpos = p - PyUnicode_AS_UNICODE(unicode);
5238 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005239 errors,
5240 &errorHandler,
5241 "utf16", errmsg,
5242 &starts,
5243 (const char **)&e,
5244 &startinpos,
5245 &endinpos,
5246 &exc,
5247 (const char **)&q,
5248 &unicode,
5249 &outpos,
5250 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00005251 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005252 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005253 /* remaining byte at the end? (size should be even) */
5254 if (e == q) {
5255 if (!consumed) {
5256 errmsg = "truncated data";
5257 startinpos = ((const char *)q) - starts;
5258 endinpos = ((const char *)e) + 1 - starts;
5259 outpos = p - PyUnicode_AS_UNICODE(unicode);
5260 if (unicode_decode_call_errorhandler(
5261 errors,
5262 &errorHandler,
5263 "utf16", errmsg,
5264 &starts,
5265 (const char **)&e,
5266 &startinpos,
5267 &endinpos,
5268 &exc,
5269 (const char **)&q,
5270 &unicode,
5271 &outpos,
5272 &p))
5273 goto onError;
5274 /* The remaining input chars are ignored if the callback
5275 chooses to skip the input */
5276 }
5277 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005278
5279 if (byteorder)
5280 *byteorder = bo;
5281
Walter Dörwald69652032004-09-07 20:24:22 +00005282 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005283 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005284
Guido van Rossumd57fd912000-03-10 22:53:23 +00005285 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005286 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005287 goto onError;
5288
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005289 Py_XDECREF(errorHandler);
5290 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005291#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005292 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005293 Py_DECREF(unicode);
5294 return NULL;
5295 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005296#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005297 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005298 return (PyObject *)unicode;
5299
Benjamin Peterson29060642009-01-31 22:14:21 +00005300 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005301 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005302 Py_XDECREF(errorHandler);
5303 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005304 return NULL;
5305}
5306
Antoine Pitrouab868312009-01-10 15:40:25 +00005307#undef FAST_CHAR_MASK
5308#undef SWAPPED_FAST_CHAR_MASK
5309
Tim Peters772747b2001-08-09 22:21:55 +00005310PyObject *
5311PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005312 Py_ssize_t size,
5313 const char *errors,
5314 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005315{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005316 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005317 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005318 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005319#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005320 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005321#else
5322 const int pairs = 0;
5323#endif
Tim Peters772747b2001-08-09 22:21:55 +00005324 /* Offsets from p for storing byte pairs in the right order. */
5325#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5326 int ihi = 1, ilo = 0;
5327#else
5328 int ihi = 0, ilo = 1;
5329#endif
5330
Benjamin Peterson29060642009-01-31 22:14:21 +00005331#define STORECHAR(CH) \
5332 do { \
5333 p[ihi] = ((CH) >> 8) & 0xff; \
5334 p[ilo] = (CH) & 0xff; \
5335 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005336 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005337
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005338#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005339 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005340 if (s[i] >= 0x10000)
5341 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005342#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005343 /* 2 * (size + pairs + (byteorder == 0)) */
5344 if (size > PY_SSIZE_T_MAX ||
5345 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005346 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005347 nsize = size + pairs + (byteorder == 0);
5348 bytesize = nsize * 2;
5349 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005350 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005351 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005352 if (v == NULL)
5353 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005354
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005355 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005356 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005357 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005358 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005359 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005360
5361 if (byteorder == -1) {
5362 /* force LE */
5363 ihi = 1;
5364 ilo = 0;
5365 }
5366 else if (byteorder == 1) {
5367 /* force BE */
5368 ihi = 0;
5369 ilo = 1;
5370 }
5371
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005372 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005373 Py_UNICODE ch = *s++;
5374 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005375#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005376 if (ch >= 0x10000) {
5377 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5378 ch = 0xD800 | ((ch-0x10000) >> 10);
5379 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005380#endif
Tim Peters772747b2001-08-09 22:21:55 +00005381 STORECHAR(ch);
5382 if (ch2)
5383 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005384 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005385
5386 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005387 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005388#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005389}
5390
Alexander Belopolsky40018472011-02-26 01:02:56 +00005391PyObject *
5392PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005393{
5394 if (!PyUnicode_Check(unicode)) {
5395 PyErr_BadArgument();
5396 return NULL;
5397 }
5398 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005399 PyUnicode_GET_SIZE(unicode),
5400 NULL,
5401 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005402}
5403
5404/* --- Unicode Escape Codec ----------------------------------------------- */
5405
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   the escaped byte string s[0..size) decodes to a pure ASCII string.

   Returns the number of characters the decoded string will contain, or
   -1 when that cannot be guaranteed, namely when:
     - size is negative,
     - a byte above 127 is present,
     - a backslash is the last byte of the input,
     - an octal, \x, \u, \U or \N escape is present (these may produce
       non-ASCII characters).

   Counting rules: an ordinary byte counts 1; a recognised single
   character escape (\\ \' \" \b \f \t \n \r \v \a) counts 1;
   backslash followed by a newline counts 0; any other backslash
   sequence counts 2 (the backslash and the following byte).
 */
Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before doing any pointer arithmetic with
       it: forming s + size for size < 0 would point before the buffer. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5460
/* Similar to PyUnicode_WRITE but either write into wstr field
   or treat string as ASCII: when `kind` is PyUnicode_WCHAR_KIND the
   buffer is addressed as Py_UNICODE units, otherwise as single bytes.
   Only one of the two stores is executed, so `index` and `value` are
   evaluated exactly once. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value)                    \
    do {                                                                \
        if ((kind) != PyUnicode_WCHAR_KIND)                             \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else                                                            \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);       \
    } while (0)

/* Store `value` as a Py_UNICODE unit at buf[index].  Statement macro.
   NOTE: it deliberately reads a variable named `kind` from the
   expansion site (it is not a macro parameter) and asserts that the
   target is a wstr buffer; callers must have such a variable in scope. */
#define WRITE_WSTR(buf, index, value)                                   \
    do {                                                                \
        assert(kind == PyUnicode_WCHAR_KIND);                           \
        ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);           \
    } while (0)
5474
5475
Fredrik Lundh06d12682001-01-24 07:59:11 +00005476static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005477
Alexander Belopolsky40018472011-02-26 01:02:56 +00005478PyObject *
5479PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005480 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005481 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005482{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005483 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005484 Py_ssize_t startinpos;
5485 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005486 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005487 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005488 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005489 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005490 char* message;
5491 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005492 PyObject *errorHandler = NULL;
5493 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005494 Py_ssize_t ascii_length;
5495 Py_ssize_t i;
5496 int kind;
5497 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005498
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005499 ascii_length = length_of_escaped_ascii_string(s, size);
5500
5501 /* After length_of_escaped_ascii_string() there are two alternatives,
5502 either the string is pure ASCII with named escapes like \n, etc.
5503 and we determined it's exact size (common case)
5504 or it contains \x, \u, ... escape sequences. then we create a
5505 legacy wchar string and resize it at the end of this function. */
5506 if (ascii_length >= 0) {
5507 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5508 if (!v)
5509 goto onError;
5510 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5511 kind = PyUnicode_1BYTE_KIND;
5512 data = PyUnicode_DATA(v);
5513 }
5514 else {
5515 /* Escaped strings will always be longer than the resulting
5516 Unicode string, so we start with size here and then reduce the
5517 length after conversion to the true value.
5518 (but if the error callback returns a long replacement string
5519 we'll have to allocate more space) */
5520 v = _PyUnicode_New(size);
5521 if (!v)
5522 goto onError;
5523 kind = PyUnicode_WCHAR_KIND;
5524 data = PyUnicode_AS_UNICODE(v);
5525 }
5526
Guido van Rossumd57fd912000-03-10 22:53:23 +00005527 if (size == 0)
5528 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005529 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005530 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005531
Guido van Rossumd57fd912000-03-10 22:53:23 +00005532 while (s < end) {
5533 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005534 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005535 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005536
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005537 if (kind == PyUnicode_WCHAR_KIND) {
5538 assert(i < _PyUnicode_WSTR_LENGTH(v));
5539 }
5540 else {
5541 /* The only case in which i == ascii_length is a backslash
5542 followed by a newline. */
5543 assert(i <= ascii_length);
5544 }
5545
Guido van Rossumd57fd912000-03-10 22:53:23 +00005546 /* Non-escape characters are interpreted as Unicode ordinals */
5547 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005548 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005549 continue;
5550 }
5551
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005552 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005553 /* \ - Escapes */
5554 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005555 c = *s++;
5556 if (s > end)
5557 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005558
5559 if (kind == PyUnicode_WCHAR_KIND) {
5560 assert(i < _PyUnicode_WSTR_LENGTH(v));
5561 }
5562 else {
5563 /* The only case in which i == ascii_length is a backslash
5564 followed by a newline. */
5565 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5566 }
5567
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005568 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005569
Benjamin Peterson29060642009-01-31 22:14:21 +00005570 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005571 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005572 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5573 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5574 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5575 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5576 /* FF */
5577 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5578 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5579 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5580 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5581 /* VT */
5582 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5583 /* BEL, not classic C */
5584 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005585
Benjamin Peterson29060642009-01-31 22:14:21 +00005586 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005587 case '0': case '1': case '2': case '3':
5588 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005589 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005590 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005591 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005592 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005593 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005595 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005596 break;
5597
Benjamin Peterson29060642009-01-31 22:14:21 +00005598 /* hex escapes */
5599 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005600 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005601 digits = 2;
5602 message = "truncated \\xXX escape";
5603 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005604
Benjamin Peterson29060642009-01-31 22:14:21 +00005605 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005606 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005607 digits = 4;
5608 message = "truncated \\uXXXX escape";
5609 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005610
Benjamin Peterson29060642009-01-31 22:14:21 +00005611 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005612 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005613 digits = 8;
5614 message = "truncated \\UXXXXXXXX escape";
5615 hexescape:
5616 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005617 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005618 if (s+digits>end) {
5619 endinpos = size;
5620 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005621 errors, &errorHandler,
5622 "unicodeescape", "end of string in escape sequence",
5623 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005624 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005625 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005626 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005627 goto nextByte;
5628 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005629 for (j = 0; j < digits; ++j) {
5630 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005631 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005632 endinpos = (s+j+1)-starts;
5633 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005634 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005635 errors, &errorHandler,
5636 "unicodeescape", message,
5637 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005638 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005639 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005640 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005641 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005642 }
5643 chr = (chr<<4) & ~0xF;
5644 if (c >= '0' && c <= '9')
5645 chr += c - '0';
5646 else if (c >= 'a' && c <= 'f')
5647 chr += 10 + c - 'a';
5648 else
5649 chr += 10 + c - 'A';
5650 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005651 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005652 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005653 /* _decoding_error will have already written into the
5654 target buffer. */
5655 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005656 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005657 /* when we get here, chr is a 32-bit unicode character */
5658 if (chr <= 0xffff)
5659 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005660 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005661 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005662 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005663 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005664#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005665 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005666#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005667 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005668 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5669 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005670#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005671 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005672 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005673 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005674 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005675 errors, &errorHandler,
5676 "unicodeescape", "illegal Unicode character",
5677 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005678 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005679 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005680 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005681 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005682 break;
5683
Benjamin Peterson29060642009-01-31 22:14:21 +00005684 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005685 case 'N':
5686 message = "malformed \\N character escape";
5687 if (ucnhash_CAPI == NULL) {
5688 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005689 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5690 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005691 if (ucnhash_CAPI == NULL)
5692 goto ucnhashError;
5693 }
5694 if (*s == '{') {
5695 const char *start = s+1;
5696 /* look for the closing brace */
5697 while (*s != '}' && s < end)
5698 s++;
5699 if (s > start && s < end && *s == '}') {
5700 /* found a name. look it up in the unicode database */
5701 message = "unknown Unicode character name";
5702 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005703 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5704 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005705 goto store;
5706 }
5707 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005708 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005709 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005710 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005711 errors, &errorHandler,
5712 "unicodeescape", message,
5713 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005714 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005715 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005716 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005717 break;
5718
5719 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005720 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005721 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005722 message = "\\ at end of string";
5723 s--;
5724 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005725 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005726 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005727 errors, &errorHandler,
5728 "unicodeescape", message,
5729 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005730 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005731 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005732 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005733 }
5734 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005735 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5736 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005737 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005738 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005739 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005740 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005741 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005743 /* Ensure the length prediction worked in case of ASCII strings */
5744 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5745
Victor Stinnerfe226c02011-10-03 03:52:20 +02005746 if (kind == PyUnicode_WCHAR_KIND)
5747 {
5748 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5749 goto onError;
Victor Stinnerfe226c02011-10-03 03:52:20 +02005750 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005751 Py_XDECREF(errorHandler);
5752 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005753#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005754 if (_PyUnicode_READY_REPLACE(&v)) {
5755 Py_DECREF(v);
5756 return NULL;
5757 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005758#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005759 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005760 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005761
Benjamin Peterson29060642009-01-31 22:14:21 +00005762 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005763 PyErr_SetString(
5764 PyExc_UnicodeError,
5765 "\\N escapes not supported (can't load unicodedata module)"
5766 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005767 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005768 Py_XDECREF(errorHandler);
5769 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005770 return NULL;
5771
Benjamin Peterson29060642009-01-31 22:14:21 +00005772 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005773 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005774 Py_XDECREF(errorHandler);
5775 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005776 return NULL;
5777}
5778
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005779#undef WRITE_ASCII_OR_WSTR
5780#undef WRITE_WSTR
5781
/* Return a Unicode-Escape string version of the Unicode object.

   NOTE: this comment used to say "If quotes is true, the string is
   enclosed in u"" or u'' quotes as appropriate."  The function defined
   below takes only (s, size); it has no quotes argument.

*/
5788
Walter Dörwald79e913e2007-05-12 11:08:06 +00005789static const char *hexdigits = "0123456789abcdef";
5790
Alexander Belopolsky40018472011-02-26 01:02:56 +00005791PyObject *
5792PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005793 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005794{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005795 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005796 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005797
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005798#ifdef Py_UNICODE_WIDE
5799 const Py_ssize_t expandsize = 10;
5800#else
5801 const Py_ssize_t expandsize = 6;
5802#endif
5803
Thomas Wouters89f507f2006-12-13 04:49:30 +00005804 /* XXX(nnorwitz): rather than over-allocating, it would be
5805 better to choose a different scheme. Perhaps scan the
5806 first N-chars of the string and allocate based on that size.
5807 */
5808 /* Initial allocation is based on the longest-possible unichr
5809 escape.
5810
5811 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5812 unichr, so in this case it's the longest unichr escape. In
5813 narrow (UTF-16) builds this is five chars per source unichr
5814 since there are two unichrs in the surrogate pair, so in narrow
5815 (UTF-16) builds it's not the longest unichr escape.
5816
5817 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5818 so in the narrow (UTF-16) build case it's the longest unichr
5819 escape.
5820 */
5821
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005822 if (size == 0)
5823 return PyBytes_FromStringAndSize(NULL, 0);
5824
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005825 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005826 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005827
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005828 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005829 2
5830 + expandsize*size
5831 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005832 if (repr == NULL)
5833 return NULL;
5834
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005835 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005836
Guido van Rossumd57fd912000-03-10 22:53:23 +00005837 while (size-- > 0) {
5838 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005839
Walter Dörwald79e913e2007-05-12 11:08:06 +00005840 /* Escape backslashes */
5841 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005842 *p++ = '\\';
5843 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005844 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005845 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005846
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005847#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005848 /* Map 21-bit characters to '\U00xxxxxx' */
5849 else if (ch >= 0x10000) {
5850 *p++ = '\\';
5851 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005852 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5853 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5854 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5855 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5856 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5857 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5858 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5859 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005860 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005861 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005862#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005863 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5864 else if (ch >= 0xD800 && ch < 0xDC00) {
5865 Py_UNICODE ch2;
5866 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005867
Benjamin Peterson29060642009-01-31 22:14:21 +00005868 ch2 = *s++;
5869 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005870 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005871 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5872 *p++ = '\\';
5873 *p++ = 'U';
5874 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5875 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5876 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5877 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5878 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5879 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5880 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5881 *p++ = hexdigits[ucs & 0x0000000F];
5882 continue;
5883 }
5884 /* Fall through: isolated surrogates are copied as-is */
5885 s--;
5886 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005887 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005888#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005889
Guido van Rossumd57fd912000-03-10 22:53:23 +00005890 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005891 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005892 *p++ = '\\';
5893 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005894 *p++ = hexdigits[(ch >> 12) & 0x000F];
5895 *p++ = hexdigits[(ch >> 8) & 0x000F];
5896 *p++ = hexdigits[(ch >> 4) & 0x000F];
5897 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005898 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005899
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005900 /* Map special whitespace to '\t', \n', '\r' */
5901 else if (ch == '\t') {
5902 *p++ = '\\';
5903 *p++ = 't';
5904 }
5905 else if (ch == '\n') {
5906 *p++ = '\\';
5907 *p++ = 'n';
5908 }
5909 else if (ch == '\r') {
5910 *p++ = '\\';
5911 *p++ = 'r';
5912 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005913
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005914 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005915 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005917 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005918 *p++ = hexdigits[(ch >> 4) & 0x000F];
5919 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005920 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005921
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922 /* Copy everything else as-is */
5923 else
5924 *p++ = (char) ch;
5925 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005926
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005927 assert(p - PyBytes_AS_STRING(repr) > 0);
5928 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5929 return NULL;
5930 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005931}
5932
Alexander Belopolsky40018472011-02-26 01:02:56 +00005933PyObject *
5934PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005935{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005936 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937 if (!PyUnicode_Check(unicode)) {
5938 PyErr_BadArgument();
5939 return NULL;
5940 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005941 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5942 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005943 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944}
5945
5946/* --- Raw Unicode Escape Codec ------------------------------------------- */
5947
Alexander Belopolsky40018472011-02-26 01:02:56 +00005948PyObject *
5949PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005950 Py_ssize_t size,
5951 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005953 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005954 Py_ssize_t startinpos;
5955 Py_ssize_t endinpos;
5956 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005957 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005958 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959 const char *end;
5960 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005961 PyObject *errorHandler = NULL;
5962 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005963
Guido van Rossumd57fd912000-03-10 22:53:23 +00005964 /* Escaped strings will always be longer than the resulting
5965 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005966 length after conversion to the true value. (But decoding error
5967 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968 v = _PyUnicode_New(size);
5969 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005970 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005971 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005972 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005973 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005974 end = s + size;
5975 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005976 unsigned char c;
5977 Py_UCS4 x;
5978 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005979 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005980
Benjamin Peterson29060642009-01-31 22:14:21 +00005981 /* Non-escape characters are interpreted as Unicode ordinals */
5982 if (*s != '\\') {
5983 *p++ = (unsigned char)*s++;
5984 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005985 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005986 startinpos = s-starts;
5987
5988 /* \u-escapes are only interpreted iff the number of leading
5989 backslashes if odd */
5990 bs = s;
5991 for (;s < end;) {
5992 if (*s != '\\')
5993 break;
5994 *p++ = (unsigned char)*s++;
5995 }
5996 if (((s - bs) & 1) == 0 ||
5997 s >= end ||
5998 (*s != 'u' && *s != 'U')) {
5999 continue;
6000 }
6001 p--;
6002 count = *s=='u' ? 4 : 8;
6003 s++;
6004
6005 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
6006 outpos = p-PyUnicode_AS_UNICODE(v);
6007 for (x = 0, i = 0; i < count; ++i, ++s) {
6008 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006009 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006010 endinpos = s-starts;
6011 if (unicode_decode_call_errorhandler(
6012 errors, &errorHandler,
6013 "rawunicodeescape", "truncated \\uXXXX",
6014 &starts, &end, &startinpos, &endinpos, &exc, &s,
6015 &v, &outpos, &p))
6016 goto onError;
6017 goto nextByte;
6018 }
6019 x = (x<<4) & ~0xF;
6020 if (c >= '0' && c <= '9')
6021 x += c - '0';
6022 else if (c >= 'a' && c <= 'f')
6023 x += 10 + c - 'a';
6024 else
6025 x += 10 + c - 'A';
6026 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00006027 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00006028 /* UCS-2 character */
6029 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006030 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006031 /* UCS-4 character. Either store directly, or as
6032 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00006033#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006034 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006035#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006036 x -= 0x10000L;
6037 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
6038 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00006039#endif
6040 } else {
6041 endinpos = s-starts;
6042 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006043 if (unicode_decode_call_errorhandler(
6044 errors, &errorHandler,
6045 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006046 &starts, &end, &startinpos, &endinpos, &exc, &s,
6047 &v, &outpos, &p))
6048 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006049 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006050 nextByte:
6051 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006052 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02006053 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006054 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006055 Py_XDECREF(errorHandler);
6056 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006057#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006058 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006059 Py_DECREF(v);
6060 return NULL;
6061 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006062#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006063 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006065
Benjamin Peterson29060642009-01-31 22:14:21 +00006066 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006067 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006068 Py_XDECREF(errorHandler);
6069 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006070 return NULL;
6071}
6072
Alexander Belopolsky40018472011-02-26 01:02:56 +00006073PyObject *
6074PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006075 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006077 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006078 char *p;
6079 char *q;
6080
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006081#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006082 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006083#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006084 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006085#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00006086
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006087 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006088 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006089
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006090 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006091 if (repr == NULL)
6092 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00006093 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006094 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006095
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006096 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006097 while (size-- > 0) {
6098 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006099#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006100 /* Map 32-bit characters to '\Uxxxxxxxx' */
6101 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006102 *p++ = '\\';
6103 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00006104 *p++ = hexdigits[(ch >> 28) & 0xf];
6105 *p++ = hexdigits[(ch >> 24) & 0xf];
6106 *p++ = hexdigits[(ch >> 20) & 0xf];
6107 *p++ = hexdigits[(ch >> 16) & 0xf];
6108 *p++ = hexdigits[(ch >> 12) & 0xf];
6109 *p++ = hexdigits[(ch >> 8) & 0xf];
6110 *p++ = hexdigits[(ch >> 4) & 0xf];
6111 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006112 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006113 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00006114#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006115 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6116 if (ch >= 0xD800 && ch < 0xDC00) {
6117 Py_UNICODE ch2;
6118 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006119
Benjamin Peterson29060642009-01-31 22:14:21 +00006120 ch2 = *s++;
6121 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00006122 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006123 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6124 *p++ = '\\';
6125 *p++ = 'U';
6126 *p++ = hexdigits[(ucs >> 28) & 0xf];
6127 *p++ = hexdigits[(ucs >> 24) & 0xf];
6128 *p++ = hexdigits[(ucs >> 20) & 0xf];
6129 *p++ = hexdigits[(ucs >> 16) & 0xf];
6130 *p++ = hexdigits[(ucs >> 12) & 0xf];
6131 *p++ = hexdigits[(ucs >> 8) & 0xf];
6132 *p++ = hexdigits[(ucs >> 4) & 0xf];
6133 *p++ = hexdigits[ucs & 0xf];
6134 continue;
6135 }
6136 /* Fall through: isolated surrogates are copied as-is */
6137 s--;
6138 size++;
6139 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006140#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006141 /* Map 16-bit characters to '\uxxxx' */
6142 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143 *p++ = '\\';
6144 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00006145 *p++ = hexdigits[(ch >> 12) & 0xf];
6146 *p++ = hexdigits[(ch >> 8) & 0xf];
6147 *p++ = hexdigits[(ch >> 4) & 0xf];
6148 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006150 /* Copy everything else as-is */
6151 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006152 *p++ = (char) ch;
6153 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006154 size = p - q;
6155
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006156 assert(size > 0);
6157 if (_PyBytes_Resize(&repr, size) < 0)
6158 return NULL;
6159 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160}
6161
Alexander Belopolsky40018472011-02-26 01:02:56 +00006162PyObject *
6163PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006164{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006165 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006166 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00006167 PyErr_BadArgument();
6168 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006169 }
Walter Dörwald711005d2007-05-12 12:03:26 +00006170 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6171 PyUnicode_GET_SIZE(unicode));
6172
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006173 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006174}
6175
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006176/* --- Unicode Internal Codec ------------------------------------------- */
6177
Alexander Belopolsky40018472011-02-26 01:02:56 +00006178PyObject *
6179_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006180 Py_ssize_t size,
6181 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006182{
6183 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006184 Py_ssize_t startinpos;
6185 Py_ssize_t endinpos;
6186 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006187 PyUnicodeObject *v;
6188 Py_UNICODE *p;
6189 const char *end;
6190 const char *reason;
6191 PyObject *errorHandler = NULL;
6192 PyObject *exc = NULL;
6193
Neal Norwitzd43069c2006-01-08 01:12:10 +00006194#ifdef Py_UNICODE_WIDE
6195 Py_UNICODE unimax = PyUnicode_GetMax();
6196#endif
6197
Thomas Wouters89f507f2006-12-13 04:49:30 +00006198 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006199 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
6200 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006201 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006202 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
6203 as string was created with the old API. */
6204 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006205 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006206 p = PyUnicode_AS_UNICODE(v);
6207 end = s + size;
6208
6209 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006210 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006211 /* We have to sanity check the raw data, otherwise doom looms for
6212 some malformed UCS-4 data. */
6213 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006214#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006215 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006216#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006217 end-s < Py_UNICODE_SIZE
6218 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006219 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006220 startinpos = s - starts;
6221 if (end-s < Py_UNICODE_SIZE) {
6222 endinpos = end-starts;
6223 reason = "truncated input";
6224 }
6225 else {
6226 endinpos = s - starts + Py_UNICODE_SIZE;
6227 reason = "illegal code point (> 0x10FFFF)";
6228 }
6229 outpos = p - PyUnicode_AS_UNICODE(v);
6230 if (unicode_decode_call_errorhandler(
6231 errors, &errorHandler,
6232 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006233 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00006234 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006235 goto onError;
6236 }
6237 }
6238 else {
6239 p++;
6240 s += Py_UNICODE_SIZE;
6241 }
6242 }
6243
Victor Stinnerfe226c02011-10-03 03:52:20 +02006244 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006245 goto onError;
6246 Py_XDECREF(errorHandler);
6247 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006248#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006249 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006250 Py_DECREF(v);
6251 return NULL;
6252 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006253#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006254 assert(_PyUnicode_CheckConsistency(v, 1));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006255 return (PyObject *)v;
6256
Benjamin Peterson29060642009-01-31 22:14:21 +00006257 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006258 Py_XDECREF(v);
6259 Py_XDECREF(errorHandler);
6260 Py_XDECREF(exc);
6261 return NULL;
6262}
6263
Guido van Rossumd57fd912000-03-10 22:53:23 +00006264/* --- Latin-1 Codec ------------------------------------------------------ */
6265
Alexander Belopolsky40018472011-02-26 01:02:56 +00006266PyObject *
6267PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006268 Py_ssize_t size,
6269 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006270{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006272 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006273}
6274
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006275/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006276static void
6277make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006278 const char *encoding,
6279 const Py_UNICODE *unicode, Py_ssize_t size,
6280 Py_ssize_t startpos, Py_ssize_t endpos,
6281 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006282{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006283 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006284 *exceptionObject = PyUnicodeEncodeError_Create(
6285 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006286 }
6287 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006288 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6289 goto onError;
6290 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6291 goto onError;
6292 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6293 goto onError;
6294 return;
6295 onError:
6296 Py_DECREF(*exceptionObject);
6297 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298 }
6299}
6300
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006301/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006302static void
6303raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006304 const char *encoding,
6305 const Py_UNICODE *unicode, Py_ssize_t size,
6306 Py_ssize_t startpos, Py_ssize_t endpos,
6307 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006308{
6309 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006310 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006311 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006312 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006313}
6314
6315/* error handling callback helper:
6316 build arguments, call the callback and check the arguments,
6317 put the result into newpos and return the replacement string, which
6318 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006319static PyObject *
6320unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006321 PyObject **errorHandler,
6322 const char *encoding, const char *reason,
6323 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6324 Py_ssize_t startpos, Py_ssize_t endpos,
6325 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006326{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006327 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006328
6329 PyObject *restuple;
6330 PyObject *resunicode;
6331
6332 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006333 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006334 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006335 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006336 }
6337
6338 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006339 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006340 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006341 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006342
6343 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006344 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006345 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006346 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006347 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006348 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006349 Py_DECREF(restuple);
6350 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006351 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006352 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006353 &resunicode, newpos)) {
6354 Py_DECREF(restuple);
6355 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006356 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006357 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6358 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6359 Py_DECREF(restuple);
6360 return NULL;
6361 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006362 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006363 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006364 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006365 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6366 Py_DECREF(restuple);
6367 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006368 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006369 Py_INCREF(resunicode);
6370 Py_DECREF(restuple);
6371 return resunicode;
6372}
6373
Alexander Belopolsky40018472011-02-26 01:02:56 +00006374static PyObject *
6375unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006376 Py_ssize_t size,
6377 const char *errors,
6378 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006379{
6380 /* output object */
6381 PyObject *res;
6382 /* pointers to the beginning and end+1 of input */
6383 const Py_UNICODE *startp = p;
6384 const Py_UNICODE *endp = p + size;
6385 /* pointer to the beginning of the unencodable characters */
6386 /* const Py_UNICODE *badp = NULL; */
6387 /* pointer into the output */
6388 char *str;
6389 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006390 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006391 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6392 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006393 PyObject *errorHandler = NULL;
6394 PyObject *exc = NULL;
6395 /* the following variable is used for caching string comparisons
6396 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6397 int known_errorHandler = -1;
6398
6399 /* allocate enough for a simple encoding without
6400 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006401 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006402 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006403 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006404 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006405 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006406 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006407 ressize = size;
6408
6409 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006410 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006411
Benjamin Peterson29060642009-01-31 22:14:21 +00006412 /* can we encode this? */
6413 if (c<limit) {
6414 /* no overflow check, because we know that the space is enough */
6415 *str++ = (char)c;
6416 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006417 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006418 else {
6419 Py_ssize_t unicodepos = p-startp;
6420 Py_ssize_t requiredsize;
6421 PyObject *repunicode;
6422 Py_ssize_t repsize;
6423 Py_ssize_t newpos;
6424 Py_ssize_t respos;
6425 Py_UNICODE *uni2;
6426 /* startpos for collecting unencodable chars */
6427 const Py_UNICODE *collstart = p;
6428 const Py_UNICODE *collend = p;
6429 /* find all unecodable characters */
6430 while ((collend < endp) && ((*collend)>=limit))
6431 ++collend;
6432 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6433 if (known_errorHandler==-1) {
6434 if ((errors==NULL) || (!strcmp(errors, "strict")))
6435 known_errorHandler = 1;
6436 else if (!strcmp(errors, "replace"))
6437 known_errorHandler = 2;
6438 else if (!strcmp(errors, "ignore"))
6439 known_errorHandler = 3;
6440 else if (!strcmp(errors, "xmlcharrefreplace"))
6441 known_errorHandler = 4;
6442 else
6443 known_errorHandler = 0;
6444 }
6445 switch (known_errorHandler) {
6446 case 1: /* strict */
6447 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6448 goto onError;
6449 case 2: /* replace */
6450 while (collstart++<collend)
6451 *str++ = '?'; /* fall through */
6452 case 3: /* ignore */
6453 p = collend;
6454 break;
6455 case 4: /* xmlcharrefreplace */
6456 respos = str - PyBytes_AS_STRING(res);
6457 /* determine replacement size (temporarily (mis)uses p) */
6458 for (p = collstart, repsize = 0; p < collend; ++p) {
6459 if (*p<10)
6460 repsize += 2+1+1;
6461 else if (*p<100)
6462 repsize += 2+2+1;
6463 else if (*p<1000)
6464 repsize += 2+3+1;
6465 else if (*p<10000)
6466 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006467#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006468 else
6469 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006470#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006471 else if (*p<100000)
6472 repsize += 2+5+1;
6473 else if (*p<1000000)
6474 repsize += 2+6+1;
6475 else
6476 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006477#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006478 }
6479 requiredsize = respos+repsize+(endp-collend);
6480 if (requiredsize > ressize) {
6481 if (requiredsize<2*ressize)
6482 requiredsize = 2*ressize;
6483 if (_PyBytes_Resize(&res, requiredsize))
6484 goto onError;
6485 str = PyBytes_AS_STRING(res) + respos;
6486 ressize = requiredsize;
6487 }
6488 /* generate replacement (temporarily (mis)uses p) */
6489 for (p = collstart; p < collend; ++p) {
6490 str += sprintf(str, "&#%d;", (int)*p);
6491 }
6492 p = collend;
6493 break;
6494 default:
6495 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6496 encoding, reason, startp, size, &exc,
6497 collstart-startp, collend-startp, &newpos);
6498 if (repunicode == NULL)
6499 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006500 if (PyBytes_Check(repunicode)) {
6501 /* Directly copy bytes result to output. */
6502 repsize = PyBytes_Size(repunicode);
6503 if (repsize > 1) {
6504 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006505 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006506 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6507 Py_DECREF(repunicode);
6508 goto onError;
6509 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006510 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006511 ressize += repsize-1;
6512 }
6513 memcpy(str, PyBytes_AsString(repunicode), repsize);
6514 str += repsize;
6515 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006516 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006517 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006518 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006519 /* need more space? (at least enough for what we
6520 have+the replacement+the rest of the string, so
6521 we won't have to check space for encodable characters) */
6522 respos = str - PyBytes_AS_STRING(res);
6523 repsize = PyUnicode_GET_SIZE(repunicode);
6524 requiredsize = respos+repsize+(endp-collend);
6525 if (requiredsize > ressize) {
6526 if (requiredsize<2*ressize)
6527 requiredsize = 2*ressize;
6528 if (_PyBytes_Resize(&res, requiredsize)) {
6529 Py_DECREF(repunicode);
6530 goto onError;
6531 }
6532 str = PyBytes_AS_STRING(res) + respos;
6533 ressize = requiredsize;
6534 }
6535 /* check if there is anything unencodable in the replacement
6536 and copy it to the output */
6537 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6538 c = *uni2;
6539 if (c >= limit) {
6540 raise_encode_exception(&exc, encoding, startp, size,
6541 unicodepos, unicodepos+1, reason);
6542 Py_DECREF(repunicode);
6543 goto onError;
6544 }
6545 *str = (char)c;
6546 }
6547 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006548 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006549 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006550 }
6551 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006552 /* Resize if we allocated to much */
6553 size = str - PyBytes_AS_STRING(res);
6554 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006555 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006556 if (_PyBytes_Resize(&res, size) < 0)
6557 goto onError;
6558 }
6559
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006560 Py_XDECREF(errorHandler);
6561 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006562 return res;
6563
6564 onError:
6565 Py_XDECREF(res);
6566 Py_XDECREF(errorHandler);
6567 Py_XDECREF(exc);
6568 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006569}
6570
Alexander Belopolsky40018472011-02-26 01:02:56 +00006571PyObject *
6572PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006573 Py_ssize_t size,
6574 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006575{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006576 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577}
6578
Alexander Belopolsky40018472011-02-26 01:02:56 +00006579PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006580_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006581{
6582 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006583 PyErr_BadArgument();
6584 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006585 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006586 if (PyUnicode_READY(unicode) == -1)
6587 return NULL;
6588 /* Fast path: if it is a one-byte string, construct
6589 bytes object directly. */
6590 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6591 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6592 PyUnicode_GET_LENGTH(unicode));
6593 /* Non-Latin-1 characters present. Defer to above function to
6594 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006596 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006597 errors);
6598}
6599
6600PyObject*
6601PyUnicode_AsLatin1String(PyObject *unicode)
6602{
6603 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604}
6605
6606/* --- 7-bit ASCII Codec -------------------------------------------------- */
6607
Alexander Belopolsky40018472011-02-26 01:02:56 +00006608PyObject *
6609PyUnicode_DecodeASCII(const char *s,
6610 Py_ssize_t size,
6611 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006613 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006614 PyUnicodeObject *v;
Victor Stinner702c7342011-10-05 13:50:52 +02006615 Py_UNICODE *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006616 Py_ssize_t startinpos;
6617 Py_ssize_t endinpos;
6618 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006619 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006620 int has_error;
6621 const unsigned char *p = (const unsigned char *)s;
6622 const unsigned char *end = p + size;
6623 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006624 PyObject *errorHandler = NULL;
6625 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006626
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006628 if (size == 1 && (unsigned char)s[0] < 128)
6629 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006630
Victor Stinner702c7342011-10-05 13:50:52 +02006631 has_error = 0;
6632 while (p < end && !has_error) {
6633 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6634 an explanation. */
6635 if (!((size_t) p & LONG_PTR_MASK)) {
6636 /* Help register allocation */
6637 register const unsigned char *_p = p;
6638 while (_p < aligned_end) {
6639 unsigned long value = *(unsigned long *) _p;
6640 if (value & ASCII_CHAR_MASK) {
6641 has_error = 1;
6642 break;
6643 }
6644 _p += SIZEOF_LONG;
6645 }
6646 if (_p == end)
6647 break;
6648 if (has_error)
6649 break;
6650 p = _p;
6651 }
6652 if (*p & 0x80) {
6653 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006654 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006655 }
6656 else {
6657 ++p;
6658 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006659 }
Victor Stinner702c7342011-10-05 13:50:52 +02006660 if (!has_error)
6661 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006662
Guido van Rossumd57fd912000-03-10 22:53:23 +00006663 v = _PyUnicode_New(size);
6664 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006665 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006667 return (PyObject *)v;
Victor Stinner702c7342011-10-05 13:50:52 +02006668 u = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006669 e = s + size;
6670 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006671 register unsigned char c = (unsigned char)*s;
6672 if (c < 128) {
Victor Stinner702c7342011-10-05 13:50:52 +02006673 *u++ = c;
Benjamin Peterson29060642009-01-31 22:14:21 +00006674 ++s;
6675 }
6676 else {
6677 startinpos = s-starts;
6678 endinpos = startinpos + 1;
Victor Stinner702c7342011-10-05 13:50:52 +02006679 outpos = u - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006680 if (unicode_decode_call_errorhandler(
6681 errors, &errorHandler,
6682 "ascii", "ordinal not in range(128)",
6683 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinner702c7342011-10-05 13:50:52 +02006684 &v, &outpos, &u))
Benjamin Peterson29060642009-01-31 22:14:21 +00006685 goto onError;
6686 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006687 }
Victor Stinner702c7342011-10-05 13:50:52 +02006688 if (u - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
6689 if (PyUnicode_Resize((PyObject**)&v, u - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006690 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006691 Py_XDECREF(errorHandler);
6692 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006693#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006694 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006695 Py_DECREF(v);
6696 return NULL;
6697 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006698#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006699 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006700 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006701
Benjamin Peterson29060642009-01-31 22:14:21 +00006702 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006703 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006704 Py_XDECREF(errorHandler);
6705 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006706 return NULL;
6707}
6708
Alexander Belopolsky40018472011-02-26 01:02:56 +00006709PyObject *
6710PyUnicode_EncodeASCII(const Py_UNICODE *p,
6711 Py_ssize_t size,
6712 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006714 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006715}
6716
Alexander Belopolsky40018472011-02-26 01:02:56 +00006717PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006718_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006719{
6720 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006721 PyErr_BadArgument();
6722 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006723 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006724 if (PyUnicode_READY(unicode) == -1)
6725 return NULL;
6726 /* Fast path: if it is an ASCII-only string, construct bytes object
6727 directly. Else defer to above function to raise the exception. */
6728 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6729 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6730 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006731 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006732 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006733 errors);
6734}
6735
6736PyObject *
6737PyUnicode_AsASCIIString(PyObject *unicode)
6738{
6739 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740}
6741
Victor Stinner99b95382011-07-04 14:23:54 +02006742#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006743
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006744/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006745
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006746#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006747#define NEED_RETRY
6748#endif
6749
6750/* XXX This code is limited to "true" double-byte encodings, as
6751 a) it assumes an incomplete character consists of a single byte, and
6752 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006753 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006754
/* Return 1 if the byte at s[offset] is treated as a DBCS lead byte,
   0 otherwise.

   The byte must satisfy IsDBCSLeadByte(); it is then accepted when
   CharPrev() cannot step back (prev == curr), when the previous
   character does not start with a lead byte, or when the previous
   character is two bytes long. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return (curr - prev == 2);
}
6766
6767/*
6768 * Decode MBCS string into unicode object. If 'final' is set, converts
6769 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6770 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006771static int
6772decode_mbcs(PyUnicodeObject **v,
6773 const char *s, /* MBCS string */
6774 int size, /* sizeof MBCS string */
6775 int final,
6776 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006777{
6778 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006779 Py_ssize_t n;
6780 DWORD usize;
6781 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006782
6783 assert(size >= 0);
6784
Victor Stinner554f3f02010-06-16 23:33:54 +00006785 /* check and handle 'errors' arg */
6786 if (errors==NULL || strcmp(errors, "strict")==0)
6787 flags = MB_ERR_INVALID_CHARS;
6788 else if (strcmp(errors, "ignore")==0)
6789 flags = 0;
6790 else {
6791 PyErr_Format(PyExc_ValueError,
6792 "mbcs encoding does not support errors='%s'",
6793 errors);
6794 return -1;
6795 }
6796
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006797 /* Skip trailing lead-byte unless 'final' is set */
6798 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006799 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006800
6801 /* First get the size of the result */
6802 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006803 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6804 if (usize==0)
6805 goto mbcs_decode_error;
6806 } else
6807 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006808
6809 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006810 /* Create unicode object */
6811 *v = _PyUnicode_New(usize);
6812 if (*v == NULL)
6813 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006814 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006815 }
6816 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006817 /* Extend unicode object */
6818 n = PyUnicode_GET_SIZE(*v);
Victor Stinner2fd82272011-10-03 04:06:05 +02006819 if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006820 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006821 }
6822
6823 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006824 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006825 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006826 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6827 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006828 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006829 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006830 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006831
6832mbcs_decode_error:
6833 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6834 we raise a UnicodeDecodeError - else it is a 'generic'
6835 windows error
6836 */
6837 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6838 /* Ideally, we should get reason from FormatMessage - this
6839 is the Windows 2000 English version of the message
6840 */
6841 PyObject *exc = NULL;
6842 const char *reason = "No mapping for the Unicode character exists "
6843 "in the target multi-byte code page.";
6844 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6845 if (exc != NULL) {
6846 PyCodec_StrictErrors(exc);
6847 Py_DECREF(exc);
6848 }
6849 } else {
6850 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6851 }
6852 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006853}
6854
Alexander Belopolsky40018472011-02-26 01:02:56 +00006855PyObject *
6856PyUnicode_DecodeMBCSStateful(const char *s,
6857 Py_ssize_t size,
6858 const char *errors,
6859 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006860{
6861 PyUnicodeObject *v = NULL;
6862 int done;
6863
6864 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006865 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006866
6867#ifdef NEED_RETRY
6868 retry:
6869 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006870 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006871 else
6872#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006873 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006874
6875 if (done < 0) {
6876 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006877 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006878 }
6879
6880 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006881 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006882
6883#ifdef NEED_RETRY
6884 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006885 s += done;
6886 size -= done;
6887 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006888 }
6889#endif
Victor Stinner17efeed2011-10-04 20:05:46 +02006890#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006891 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006892 Py_DECREF(v);
6893 return NULL;
6894 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006895#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006896 assert(_PyUnicode_CheckConsistency(v, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006897 return (PyObject *)v;
6898}
6899
Alexander Belopolsky40018472011-02-26 01:02:56 +00006900PyObject *
6901PyUnicode_DecodeMBCS(const char *s,
6902 Py_ssize_t size,
6903 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006904{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006905 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6906}
6907
6908/*
6909 * Convert unicode into string object (MBCS).
6910 * Returns 0 if succeed, -1 otherwise.
6911 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006912static int
6913encode_mbcs(PyObject **repr,
6914 const Py_UNICODE *p, /* unicode */
6915 int size, /* size of unicode */
6916 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006917{
Victor Stinner554f3f02010-06-16 23:33:54 +00006918 BOOL usedDefaultChar = FALSE;
6919 BOOL *pusedDefaultChar;
6920 int mbcssize;
6921 Py_ssize_t n;
6922 PyObject *exc = NULL;
6923 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006924
6925 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006926
Victor Stinner554f3f02010-06-16 23:33:54 +00006927 /* check and handle 'errors' arg */
6928 if (errors==NULL || strcmp(errors, "strict")==0) {
6929 flags = WC_NO_BEST_FIT_CHARS;
6930 pusedDefaultChar = &usedDefaultChar;
6931 } else if (strcmp(errors, "replace")==0) {
6932 flags = 0;
6933 pusedDefaultChar = NULL;
6934 } else {
6935 PyErr_Format(PyExc_ValueError,
6936 "mbcs encoding does not support errors='%s'",
6937 errors);
6938 return -1;
6939 }
6940
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006941 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006942 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006943 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6944 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006945 if (mbcssize == 0) {
6946 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6947 return -1;
6948 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006949 /* If we used a default char, then we failed! */
6950 if (pusedDefaultChar && *pusedDefaultChar)
6951 goto mbcs_encode_error;
6952 } else {
6953 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006954 }
6955
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006956 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006957 /* Create string object */
6958 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6959 if (*repr == NULL)
6960 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006961 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006962 }
6963 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006964 /* Extend string object */
6965 n = PyBytes_Size(*repr);
6966 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6967 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006968 }
6969
6970 /* Do the conversion */
6971 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006972 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006973 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6974 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006975 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6976 return -1;
6977 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006978 if (pusedDefaultChar && *pusedDefaultChar)
6979 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006980 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006981 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006982
6983mbcs_encode_error:
6984 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6985 Py_XDECREF(exc);
6986 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006987}
6988
Alexander Belopolsky40018472011-02-26 01:02:56 +00006989PyObject *
6990PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6991 Py_ssize_t size,
6992 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006993{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006994 PyObject *repr = NULL;
6995 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006996
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006997#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006998 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006999 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00007000 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007001 else
7002#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00007003 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007004
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007005 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007006 Py_XDECREF(repr);
7007 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007008 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007009
7010#ifdef NEED_RETRY
7011 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007012 p += INT_MAX;
7013 size -= INT_MAX;
7014 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007015 }
7016#endif
7017
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007018 return repr;
7019}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007020
Alexander Belopolsky40018472011-02-26 01:02:56 +00007021PyObject *
7022PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007023{
7024 if (!PyUnicode_Check(unicode)) {
7025 PyErr_BadArgument();
7026 return NULL;
7027 }
7028 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007029 PyUnicode_GET_SIZE(unicode),
7030 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007031}
7032
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007033#undef NEED_RETRY
7034
Victor Stinner99b95382011-07-04 14:23:54 +02007035#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007036
Guido van Rossumd57fd912000-03-10 22:53:23 +00007037/* --- Character Mapping Codec -------------------------------------------- */
7038
Alexander Belopolsky40018472011-02-26 01:02:56 +00007039PyObject *
7040PyUnicode_DecodeCharmap(const char *s,
7041 Py_ssize_t size,
7042 PyObject *mapping,
7043 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007044{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007045 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007046 Py_ssize_t startinpos;
7047 Py_ssize_t endinpos;
7048 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007049 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007050 PyUnicodeObject *v;
7051 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007052 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007053 PyObject *errorHandler = NULL;
7054 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007055 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007056 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00007057
Guido van Rossumd57fd912000-03-10 22:53:23 +00007058 /* Default to Latin-1 */
7059 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007060 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007061
7062 v = _PyUnicode_New(size);
7063 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007064 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007065 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007066 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007067 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007068 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007069 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007070 mapstring = PyUnicode_AS_UNICODE(mapping);
7071 maplen = PyUnicode_GET_SIZE(mapping);
7072 while (s < e) {
7073 unsigned char ch = *s;
7074 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007075
Benjamin Peterson29060642009-01-31 22:14:21 +00007076 if (ch < maplen)
7077 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00007078
Benjamin Peterson29060642009-01-31 22:14:21 +00007079 if (x == 0xfffe) {
7080 /* undefined mapping */
7081 outpos = p-PyUnicode_AS_UNICODE(v);
7082 startinpos = s-starts;
7083 endinpos = startinpos+1;
7084 if (unicode_decode_call_errorhandler(
7085 errors, &errorHandler,
7086 "charmap", "character maps to <undefined>",
7087 &starts, &e, &startinpos, &endinpos, &exc, &s,
7088 &v, &outpos, &p)) {
7089 goto onError;
7090 }
7091 continue;
7092 }
7093 *p++ = x;
7094 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007095 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007096 }
7097 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007098 while (s < e) {
7099 unsigned char ch = *s;
7100 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007101
Benjamin Peterson29060642009-01-31 22:14:21 +00007102 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7103 w = PyLong_FromLong((long)ch);
7104 if (w == NULL)
7105 goto onError;
7106 x = PyObject_GetItem(mapping, w);
7107 Py_DECREF(w);
7108 if (x == NULL) {
7109 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7110 /* No mapping found means: mapping is undefined. */
7111 PyErr_Clear();
7112 x = Py_None;
7113 Py_INCREF(x);
7114 } else
7115 goto onError;
7116 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007117
Benjamin Peterson29060642009-01-31 22:14:21 +00007118 /* Apply mapping */
7119 if (PyLong_Check(x)) {
7120 long value = PyLong_AS_LONG(x);
7121 if (value < 0 || value > 65535) {
7122 PyErr_SetString(PyExc_TypeError,
7123 "character mapping must be in range(65536)");
7124 Py_DECREF(x);
7125 goto onError;
7126 }
7127 *p++ = (Py_UNICODE)value;
7128 }
7129 else if (x == Py_None) {
7130 /* undefined mapping */
7131 outpos = p-PyUnicode_AS_UNICODE(v);
7132 startinpos = s-starts;
7133 endinpos = startinpos+1;
7134 if (unicode_decode_call_errorhandler(
7135 errors, &errorHandler,
7136 "charmap", "character maps to <undefined>",
7137 &starts, &e, &startinpos, &endinpos, &exc, &s,
7138 &v, &outpos, &p)) {
7139 Py_DECREF(x);
7140 goto onError;
7141 }
7142 Py_DECREF(x);
7143 continue;
7144 }
7145 else if (PyUnicode_Check(x)) {
7146 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007147
Benjamin Peterson29060642009-01-31 22:14:21 +00007148 if (targetsize == 1)
7149 /* 1-1 mapping */
7150 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007151
Benjamin Peterson29060642009-01-31 22:14:21 +00007152 else if (targetsize > 1) {
7153 /* 1-n mapping */
7154 if (targetsize > extrachars) {
7155 /* resize first */
7156 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
7157 Py_ssize_t needed = (targetsize - extrachars) + \
7158 (targetsize << 2);
7159 extrachars += needed;
7160 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02007161 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00007162 PyUnicode_GET_SIZE(v) + needed) < 0) {
7163 Py_DECREF(x);
7164 goto onError;
7165 }
7166 p = PyUnicode_AS_UNICODE(v) + oldpos;
7167 }
7168 Py_UNICODE_COPY(p,
7169 PyUnicode_AS_UNICODE(x),
7170 targetsize);
7171 p += targetsize;
7172 extrachars -= targetsize;
7173 }
7174 /* 1-0 mapping: skip the character */
7175 }
7176 else {
7177 /* wrong return value */
7178 PyErr_SetString(PyExc_TypeError,
7179 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007180 Py_DECREF(x);
7181 goto onError;
7182 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007183 Py_DECREF(x);
7184 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007185 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007186 }
7187 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02007188 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007189 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007190 Py_XDECREF(errorHandler);
7191 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02007192#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007193 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007194 Py_DECREF(v);
7195 return NULL;
7196 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007197#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007198 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007199 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00007200
Benjamin Peterson29060642009-01-31 22:14:21 +00007201 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007202 Py_XDECREF(errorHandler);
7203 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007204 Py_XDECREF(v);
7205 return NULL;
7206}
7207
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007208/* Charmap encoding: the lookup table */
7209
Alexander Belopolsky40018472011-02-26 01:02:56 +00007210struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007211 PyObject_HEAD
7212 unsigned char level1[32];
7213 int count2, count3;
7214 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007215};
7216
7217static PyObject*
7218encoding_map_size(PyObject *obj, PyObject* args)
7219{
7220 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007221 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007222 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007223}
7224
7225static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007226 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007227 PyDoc_STR("Return the size (in bytes) of this object") },
7228 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007229};
7230
7231static void
7232encoding_map_dealloc(PyObject* o)
7233{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007234 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007235}
7236
7237static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007238 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007239 "EncodingMap", /*tp_name*/
7240 sizeof(struct encoding_map), /*tp_basicsize*/
7241 0, /*tp_itemsize*/
7242 /* methods */
7243 encoding_map_dealloc, /*tp_dealloc*/
7244 0, /*tp_print*/
7245 0, /*tp_getattr*/
7246 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007247 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007248 0, /*tp_repr*/
7249 0, /*tp_as_number*/
7250 0, /*tp_as_sequence*/
7251 0, /*tp_as_mapping*/
7252 0, /*tp_hash*/
7253 0, /*tp_call*/
7254 0, /*tp_str*/
7255 0, /*tp_getattro*/
7256 0, /*tp_setattro*/
7257 0, /*tp_as_buffer*/
7258 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7259 0, /*tp_doc*/
7260 0, /*tp_traverse*/
7261 0, /*tp_clear*/
7262 0, /*tp_richcompare*/
7263 0, /*tp_weaklistoffset*/
7264 0, /*tp_iter*/
7265 0, /*tp_iternext*/
7266 encoding_map_methods, /*tp_methods*/
7267 0, /*tp_members*/
7268 0, /*tp_getset*/
7269 0, /*tp_base*/
7270 0, /*tp_dict*/
7271 0, /*tp_descr_get*/
7272 0, /*tp_descr_set*/
7273 0, /*tp_dictoffset*/
7274 0, /*tp_init*/
7275 0, /*tp_alloc*/
7276 0, /*tp_new*/
7277 0, /*tp_free*/
7278 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007279};
7280
7281PyObject*
7282PyUnicode_BuildEncodingMap(PyObject* string)
7283{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007284 PyObject *result;
7285 struct encoding_map *mresult;
7286 int i;
7287 int need_dict = 0;
7288 unsigned char level1[32];
7289 unsigned char level2[512];
7290 unsigned char *mlevel1, *mlevel2, *mlevel3;
7291 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007292 int kind;
7293 void *data;
7294 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007295
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007296 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007297 PyErr_BadArgument();
7298 return NULL;
7299 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007300 kind = PyUnicode_KIND(string);
7301 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007302 memset(level1, 0xFF, sizeof level1);
7303 memset(level2, 0xFF, sizeof level2);
7304
7305 /* If there isn't a one-to-one mapping of NULL to \0,
7306 or if there are non-BMP characters, we need to use
7307 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007308 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007309 need_dict = 1;
7310 for (i = 1; i < 256; i++) {
7311 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007312 ch = PyUnicode_READ(kind, data, i);
7313 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007314 need_dict = 1;
7315 break;
7316 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007317 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007318 /* unmapped character */
7319 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007320 l1 = ch >> 11;
7321 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007322 if (level1[l1] == 0xFF)
7323 level1[l1] = count2++;
7324 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007325 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007326 }
7327
7328 if (count2 >= 0xFF || count3 >= 0xFF)
7329 need_dict = 1;
7330
7331 if (need_dict) {
7332 PyObject *result = PyDict_New();
7333 PyObject *key, *value;
7334 if (!result)
7335 return NULL;
7336 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007337 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007338 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007339 if (!key || !value)
7340 goto failed1;
7341 if (PyDict_SetItem(result, key, value) == -1)
7342 goto failed1;
7343 Py_DECREF(key);
7344 Py_DECREF(value);
7345 }
7346 return result;
7347 failed1:
7348 Py_XDECREF(key);
7349 Py_XDECREF(value);
7350 Py_DECREF(result);
7351 return NULL;
7352 }
7353
7354 /* Create a three-level trie */
7355 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7356 16*count2 + 128*count3 - 1);
7357 if (!result)
7358 return PyErr_NoMemory();
7359 PyObject_Init(result, &EncodingMapType);
7360 mresult = (struct encoding_map*)result;
7361 mresult->count2 = count2;
7362 mresult->count3 = count3;
7363 mlevel1 = mresult->level1;
7364 mlevel2 = mresult->level23;
7365 mlevel3 = mresult->level23 + 16*count2;
7366 memcpy(mlevel1, level1, 32);
7367 memset(mlevel2, 0xFF, 16*count2);
7368 memset(mlevel3, 0, 128*count3);
7369 count3 = 0;
7370 for (i = 1; i < 256; i++) {
7371 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007372 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007373 /* unmapped character */
7374 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007375 o1 = PyUnicode_READ(kind, data, i)>>11;
7376 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007377 i2 = 16*mlevel1[o1] + o2;
7378 if (mlevel2[i2] == 0xFF)
7379 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007380 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007381 i3 = 128*mlevel2[i2] + o3;
7382 mlevel3[i3] = i;
7383 }
7384 return result;
7385}
7386
7387static int
7388encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7389{
7390 struct encoding_map *map = (struct encoding_map*)mapping;
7391 int l1 = c>>11;
7392 int l2 = (c>>7) & 0xF;
7393 int l3 = c & 0x7F;
7394 int i;
7395
7396#ifdef Py_UNICODE_WIDE
7397 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007398 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007399 }
7400#endif
7401 if (c == 0)
7402 return 0;
7403 /* level 1*/
7404 i = map->level1[l1];
7405 if (i == 0xFF) {
7406 return -1;
7407 }
7408 /* level 2*/
7409 i = map->level23[16*i+l2];
7410 if (i == 0xFF) {
7411 return -1;
7412 }
7413 /* level 3 */
7414 i = map->level23[16*map->count2 + 128*i + l3];
7415 if (i == 0) {
7416 return -1;
7417 }
7418 return i;
7419}
7420
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007421/* Lookup the character ch in the mapping. If the character
7422 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007423 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007424static PyObject *
7425charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007426{
Christian Heimes217cfd12007-12-02 14:31:20 +00007427 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007428 PyObject *x;
7429
7430 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007431 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007432 x = PyObject_GetItem(mapping, w);
7433 Py_DECREF(w);
7434 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007435 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7436 /* No mapping found means: mapping is undefined. */
7437 PyErr_Clear();
7438 x = Py_None;
7439 Py_INCREF(x);
7440 return x;
7441 } else
7442 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007443 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007444 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007445 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007446 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007447 long value = PyLong_AS_LONG(x);
7448 if (value < 0 || value > 255) {
7449 PyErr_SetString(PyExc_TypeError,
7450 "character mapping must be in range(256)");
7451 Py_DECREF(x);
7452 return NULL;
7453 }
7454 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007455 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007456 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007457 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007458 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007459 /* wrong return value */
7460 PyErr_Format(PyExc_TypeError,
7461 "character mapping must return integer, bytes or None, not %.400s",
7462 x->ob_type->tp_name);
7463 Py_DECREF(x);
7464 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007465 }
7466}
7467
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007468static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007469charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007470{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007471 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7472 /* exponentially overallocate to minimize reallocations */
7473 if (requiredsize < 2*outsize)
7474 requiredsize = 2*outsize;
7475 if (_PyBytes_Resize(outobj, requiredsize))
7476 return -1;
7477 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007478}
7479
/* Outcome of encoding one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character's replacement was written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* the lookup or a buffer resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007483/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007484 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007485 space is available. Return a new reference to the object that
7486 was put in the output buffer, or Py_None, if the mapping was undefined
7487 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007488 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007489static charmapencode_result
7490charmapencode_output(Py_UNICODE c, PyObject *mapping,
7491 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007492{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007493 PyObject *rep;
7494 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007495 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007496
Christian Heimes90aa7642007-12-19 02:45:37 +00007497 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007498 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007499 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007500 if (res == -1)
7501 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007502 if (outsize<requiredsize)
7503 if (charmapencode_resize(outobj, outpos, requiredsize))
7504 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007505 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007506 outstart[(*outpos)++] = (char)res;
7507 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007508 }
7509
7510 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007511 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007512 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007513 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007514 Py_DECREF(rep);
7515 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007516 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007517 if (PyLong_Check(rep)) {
7518 Py_ssize_t requiredsize = *outpos+1;
7519 if (outsize<requiredsize)
7520 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7521 Py_DECREF(rep);
7522 return enc_EXCEPTION;
7523 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007524 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007525 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007526 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007527 else {
7528 const char *repchars = PyBytes_AS_STRING(rep);
7529 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7530 Py_ssize_t requiredsize = *outpos+repsize;
7531 if (outsize<requiredsize)
7532 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7533 Py_DECREF(rep);
7534 return enc_EXCEPTION;
7535 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007536 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007537 memcpy(outstart + *outpos, repchars, repsize);
7538 *outpos += repsize;
7539 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007540 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007541 Py_DECREF(rep);
7542 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007543}
7544
7545/* handle an error in PyUnicode_EncodeCharmap
7546 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007547static int
7548charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00007549 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007550 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007551 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007552 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007553{
7554 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007555 Py_ssize_t repsize;
7556 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007557 Py_UNICODE *uni2;
7558 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007559 Py_ssize_t collstartpos = *inpos;
7560 Py_ssize_t collendpos = *inpos+1;
7561 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007562 char *encoding = "charmap";
7563 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007564 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007565
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007566 /* find all unencodable characters */
7567 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007568 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007569 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007570 int res = encoding_map_lookup(p[collendpos], mapping);
7571 if (res != -1)
7572 break;
7573 ++collendpos;
7574 continue;
7575 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007576
Benjamin Peterson29060642009-01-31 22:14:21 +00007577 rep = charmapencode_lookup(p[collendpos], mapping);
7578 if (rep==NULL)
7579 return -1;
7580 else if (rep!=Py_None) {
7581 Py_DECREF(rep);
7582 break;
7583 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007584 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007585 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007586 }
7587 /* cache callback name lookup
7588 * (if not done yet, i.e. it's the first error) */
7589 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007590 if ((errors==NULL) || (!strcmp(errors, "strict")))
7591 *known_errorHandler = 1;
7592 else if (!strcmp(errors, "replace"))
7593 *known_errorHandler = 2;
7594 else if (!strcmp(errors, "ignore"))
7595 *known_errorHandler = 3;
7596 else if (!strcmp(errors, "xmlcharrefreplace"))
7597 *known_errorHandler = 4;
7598 else
7599 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007600 }
7601 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007602 case 1: /* strict */
7603 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7604 return -1;
7605 case 2: /* replace */
7606 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007607 x = charmapencode_output('?', mapping, res, respos);
7608 if (x==enc_EXCEPTION) {
7609 return -1;
7610 }
7611 else if (x==enc_FAILED) {
7612 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7613 return -1;
7614 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007615 }
7616 /* fall through */
7617 case 3: /* ignore */
7618 *inpos = collendpos;
7619 break;
7620 case 4: /* xmlcharrefreplace */
7621 /* generate replacement (temporarily (mis)uses p) */
7622 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007623 char buffer[2+29+1+1];
7624 char *cp;
7625 sprintf(buffer, "&#%d;", (int)p[collpos]);
7626 for (cp = buffer; *cp; ++cp) {
7627 x = charmapencode_output(*cp, mapping, res, respos);
7628 if (x==enc_EXCEPTION)
7629 return -1;
7630 else if (x==enc_FAILED) {
7631 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7632 return -1;
7633 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007634 }
7635 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007636 *inpos = collendpos;
7637 break;
7638 default:
7639 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007640 encoding, reason, p, size, exceptionObject,
7641 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007642 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007643 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007644 if (PyBytes_Check(repunicode)) {
7645 /* Directly copy bytes result to output. */
7646 Py_ssize_t outsize = PyBytes_Size(*res);
7647 Py_ssize_t requiredsize;
7648 repsize = PyBytes_Size(repunicode);
7649 requiredsize = *respos + repsize;
7650 if (requiredsize > outsize)
7651 /* Make room for all additional bytes. */
7652 if (charmapencode_resize(res, respos, requiredsize)) {
7653 Py_DECREF(repunicode);
7654 return -1;
7655 }
7656 memcpy(PyBytes_AsString(*res) + *respos,
7657 PyBytes_AsString(repunicode), repsize);
7658 *respos += repsize;
7659 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007660 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007661 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007662 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007663 /* generate replacement */
7664 repsize = PyUnicode_GET_SIZE(repunicode);
7665 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007666 x = charmapencode_output(*uni2, mapping, res, respos);
7667 if (x==enc_EXCEPTION) {
7668 return -1;
7669 }
7670 else if (x==enc_FAILED) {
7671 Py_DECREF(repunicode);
7672 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7673 return -1;
7674 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007675 }
7676 *inpos = newpos;
7677 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007678 }
7679 return 0;
7680}
7681
Alexander Belopolsky40018472011-02-26 01:02:56 +00007682PyObject *
7683PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7684 Py_ssize_t size,
7685 PyObject *mapping,
7686 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007687{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007688 /* output object */
7689 PyObject *res = NULL;
7690 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007691 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007692 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007693 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007694 PyObject *errorHandler = NULL;
7695 PyObject *exc = NULL;
7696 /* the following variable is used for caching string comparisons
7697 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7698 * 3=ignore, 4=xmlcharrefreplace */
7699 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007700
7701 /* Default to Latin-1 */
7702 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007703 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007704
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007705 /* allocate enough for a simple encoding without
7706 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007707 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007708 if (res == NULL)
7709 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007710 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007711 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007712
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007713 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007714 /* try to encode it */
7715 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7716 if (x==enc_EXCEPTION) /* error */
7717 goto onError;
7718 if (x==enc_FAILED) { /* unencodable character */
7719 if (charmap_encoding_error(p, size, &inpos, mapping,
7720 &exc,
7721 &known_errorHandler, &errorHandler, errors,
7722 &res, &respos)) {
7723 goto onError;
7724 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007725 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007726 else
7727 /* done with this character => adjust input position */
7728 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007729 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007730
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007731 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007732 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007733 if (_PyBytes_Resize(&res, respos) < 0)
7734 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007735
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007736 Py_XDECREF(exc);
7737 Py_XDECREF(errorHandler);
7738 return res;
7739
Benjamin Peterson29060642009-01-31 22:14:21 +00007740 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007741 Py_XDECREF(res);
7742 Py_XDECREF(exc);
7743 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007744 return NULL;
7745}
7746
Alexander Belopolsky40018472011-02-26 01:02:56 +00007747PyObject *
7748PyUnicode_AsCharmapString(PyObject *unicode,
7749 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007750{
7751 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007752 PyErr_BadArgument();
7753 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007754 }
7755 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007756 PyUnicode_GET_SIZE(unicode),
7757 mapping,
7758 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007759}
7760
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007761/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007762static void
7763make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007764 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007765 Py_ssize_t startpos, Py_ssize_t endpos,
7766 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007767{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007768 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007769 *exceptionObject = _PyUnicodeTranslateError_Create(
7770 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007771 }
7772 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007773 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7774 goto onError;
7775 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7776 goto onError;
7777 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7778 goto onError;
7779 return;
7780 onError:
7781 Py_DECREF(*exceptionObject);
7782 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007783 }
7784}
7785
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007786/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007787static void
7788raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007789 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007790 Py_ssize_t startpos, Py_ssize_t endpos,
7791 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007792{
7793 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007794 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007795 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007796 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007797}
7798
7799/* error handling callback helper:
7800 build arguments, call the callback and check the arguments,
7801 put the result into newpos and return the replacement string, which
7802 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007803static PyObject *
7804unicode_translate_call_errorhandler(const char *errors,
7805 PyObject **errorHandler,
7806 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007807 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007808 Py_ssize_t startpos, Py_ssize_t endpos,
7809 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007810{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007811 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007812
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007813 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007814 PyObject *restuple;
7815 PyObject *resunicode;
7816
7817 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007818 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007819 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007820 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007821 }
7822
7823 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007824 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007825 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007826 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007827
7828 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007829 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007830 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007831 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007832 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007833 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007834 Py_DECREF(restuple);
7835 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007836 }
7837 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007838 &resunicode, &i_newpos)) {
7839 Py_DECREF(restuple);
7840 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007841 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007842 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007843 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007844 else
7845 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007846 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007847 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7848 Py_DECREF(restuple);
7849 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007850 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007851 Py_INCREF(resunicode);
7852 Py_DECREF(restuple);
7853 return resunicode;
7854}
7855
7856/* Lookup the character ch in the mapping and put the result in result,
7857 which must be decrefed by the caller.
7858 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007859static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007860charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007861{
Christian Heimes217cfd12007-12-02 14:31:20 +00007862 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007863 PyObject *x;
7864
7865 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007866 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007867 x = PyObject_GetItem(mapping, w);
7868 Py_DECREF(w);
7869 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007870 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7871 /* No mapping found means: use 1:1 mapping. */
7872 PyErr_Clear();
7873 *result = NULL;
7874 return 0;
7875 } else
7876 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007877 }
7878 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007879 *result = x;
7880 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007881 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007882 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007883 long value = PyLong_AS_LONG(x);
7884 long max = PyUnicode_GetMax();
7885 if (value < 0 || value > max) {
7886 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007887 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007888 Py_DECREF(x);
7889 return -1;
7890 }
7891 *result = x;
7892 return 0;
7893 }
7894 else if (PyUnicode_Check(x)) {
7895 *result = x;
7896 return 0;
7897 }
7898 else {
7899 /* wrong return value */
7900 PyErr_SetString(PyExc_TypeError,
7901 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007902 Py_DECREF(x);
7903 return -1;
7904 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007905}
7906/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007907 if not reallocate and adjust various state variables.
7908 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007909static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007910charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007911 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007912{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007913 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007914 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007915 /* exponentially overallocate to minimize reallocations */
7916 if (requiredsize < 2 * oldsize)
7917 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007918 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7919 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007920 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007921 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007922 }
7923 return 0;
7924}
7925/* lookup the character, put the result in the output string and adjust
7926 various state variables. Return a new reference to the object that
7927 was put in the output buffer in *result, or Py_None, if the mapping was
7928 undefined (in which case no character was written).
7929 The called must decref result.
7930 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007931static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007932charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7933 PyObject *mapping, Py_UCS4 **output,
7934 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007935 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007936{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007937 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7938 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007939 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007940 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007941 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007942 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007943 }
7944 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007945 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007946 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007947 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007948 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007949 }
7950 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007951 Py_ssize_t repsize;
7952 if (PyUnicode_READY(*res) == -1)
7953 return -1;
7954 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007955 if (repsize==1) {
7956 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007957 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007958 }
7959 else if (repsize!=0) {
7960 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007961 Py_ssize_t requiredsize = *opos +
7962 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007963 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007964 Py_ssize_t i;
7965 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007966 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007967 for(i = 0; i < repsize; i++)
7968 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007969 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007970 }
7971 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007972 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007973 return 0;
7974}
7975
Alexander Belopolsky40018472011-02-26 01:02:56 +00007976PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007977_PyUnicode_TranslateCharmap(PyObject *input,
7978 PyObject *mapping,
7979 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007980{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007981 /* input object */
7982 char *idata;
7983 Py_ssize_t size, i;
7984 int kind;
7985 /* output buffer */
7986 Py_UCS4 *output = NULL;
7987 Py_ssize_t osize;
7988 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007989 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007990 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007991 char *reason = "character maps to <undefined>";
7992 PyObject *errorHandler = NULL;
7993 PyObject *exc = NULL;
7994 /* the following variable is used for caching string comparisons
7995 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7996 * 3=ignore, 4=xmlcharrefreplace */
7997 int known_errorHandler = -1;
7998
Guido van Rossumd57fd912000-03-10 22:53:23 +00007999 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008000 PyErr_BadArgument();
8001 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008002 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008003
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008004 if (PyUnicode_READY(input) == -1)
8005 return NULL;
8006 idata = (char*)PyUnicode_DATA(input);
8007 kind = PyUnicode_KIND(input);
8008 size = PyUnicode_GET_LENGTH(input);
8009 i = 0;
8010
8011 if (size == 0) {
8012 Py_INCREF(input);
8013 return input;
8014 }
8015
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008016 /* allocate enough for a simple 1:1 translation without
8017 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008018 osize = size;
8019 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8020 opos = 0;
8021 if (output == NULL) {
8022 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008023 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008024 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008025
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008026 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008027 /* try to encode it */
8028 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008029 if (charmaptranslate_output(input, i, mapping,
8030 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008031 Py_XDECREF(x);
8032 goto onError;
8033 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008034 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008035 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008036 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008037 else { /* untranslatable character */
8038 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8039 Py_ssize_t repsize;
8040 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008041 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008042 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008043 Py_ssize_t collstart = i;
8044 Py_ssize_t collend = i+1;
8045 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008046
Benjamin Peterson29060642009-01-31 22:14:21 +00008047 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008048 while (collend < size) {
8049 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008050 goto onError;
8051 Py_XDECREF(x);
8052 if (x!=Py_None)
8053 break;
8054 ++collend;
8055 }
8056 /* cache callback name lookup
8057 * (if not done yet, i.e. it's the first error) */
8058 if (known_errorHandler==-1) {
8059 if ((errors==NULL) || (!strcmp(errors, "strict")))
8060 known_errorHandler = 1;
8061 else if (!strcmp(errors, "replace"))
8062 known_errorHandler = 2;
8063 else if (!strcmp(errors, "ignore"))
8064 known_errorHandler = 3;
8065 else if (!strcmp(errors, "xmlcharrefreplace"))
8066 known_errorHandler = 4;
8067 else
8068 known_errorHandler = 0;
8069 }
8070 switch (known_errorHandler) {
8071 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008072 raise_translate_exception(&exc, input, collstart,
8073 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008074 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008075 case 2: /* replace */
8076 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008077 for (coll = collstart; coll<collend; coll++)
8078 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008079 /* fall through */
8080 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008081 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008082 break;
8083 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008084 /* generate replacement (temporarily (mis)uses i) */
8085 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008086 char buffer[2+29+1+1];
8087 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008088 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8089 if (charmaptranslate_makespace(&output, &osize,
8090 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008091 goto onError;
8092 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008093 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008094 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008095 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008096 break;
8097 default:
8098 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008099 reason, input, &exc,
8100 collstart, collend, &newpos);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008101 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00008102 goto onError;
8103 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008104 repsize = PyUnicode_GET_LENGTH(repunicode);
8105 if (charmaptranslate_makespace(&output, &osize,
8106 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008107 Py_DECREF(repunicode);
8108 goto onError;
8109 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008110 for (uni2 = 0; repsize-->0; ++uni2)
8111 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8112 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008113 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008114 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008115 }
8116 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008117 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8118 if (!res)
8119 goto onError;
8120 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008121 Py_XDECREF(exc);
8122 Py_XDECREF(errorHandler);
8123 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008124
Benjamin Peterson29060642009-01-31 22:14:21 +00008125 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008126 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008127 Py_XDECREF(exc);
8128 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008129 return NULL;
8130}
8131
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008132/* Deprecated. Use PyUnicode_Translate instead. */
8133PyObject *
8134PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8135 Py_ssize_t size,
8136 PyObject *mapping,
8137 const char *errors)
8138{
8139 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8140 if (!unicode)
8141 return NULL;
8142 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8143}
8144
Alexander Belopolsky40018472011-02-26 01:02:56 +00008145PyObject *
8146PyUnicode_Translate(PyObject *str,
8147 PyObject *mapping,
8148 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008149{
8150 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008151
Guido van Rossumd57fd912000-03-10 22:53:23 +00008152 str = PyUnicode_FromObject(str);
8153 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008154 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008155 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008156 Py_DECREF(str);
8157 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008158
Benjamin Peterson29060642009-01-31 22:14:21 +00008159 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008160 Py_XDECREF(str);
8161 return NULL;
8162}
Tim Petersced69f82003-09-16 20:30:58 +00008163
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008164static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008165fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008166{
8167 /* No need to call PyUnicode_READY(self) because this function is only
8168 called as a callback from fixup() which does it already. */
8169 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8170 const int kind = PyUnicode_KIND(self);
8171 void *data = PyUnicode_DATA(self);
8172 Py_UCS4 maxchar = 0, ch, fixed;
8173 Py_ssize_t i;
8174
8175 for (i = 0; i < len; ++i) {
8176 ch = PyUnicode_READ(kind, data, i);
8177 fixed = 0;
8178 if (ch > 127) {
8179 if (Py_UNICODE_ISSPACE(ch))
8180 fixed = ' ';
8181 else {
8182 const int decimal = Py_UNICODE_TODECIMAL(ch);
8183 if (decimal >= 0)
8184 fixed = '0' + decimal;
8185 }
8186 if (fixed != 0) {
8187 if (fixed > maxchar)
8188 maxchar = fixed;
8189 PyUnicode_WRITE(kind, data, i, fixed);
8190 }
8191 else if (ch > maxchar)
8192 maxchar = ch;
8193 }
8194 else if (ch > maxchar)
8195 maxchar = ch;
8196 }
8197
8198 return maxchar;
8199}
8200
8201PyObject *
8202_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8203{
8204 if (!PyUnicode_Check(unicode)) {
8205 PyErr_BadInternalCall();
8206 return NULL;
8207 }
8208 if (PyUnicode_READY(unicode) == -1)
8209 return NULL;
8210 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8211 /* If the string is already ASCII, just return the same string */
8212 Py_INCREF(unicode);
8213 return unicode;
8214 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008215 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008216}
8217
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008218PyObject *
8219PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8220 Py_ssize_t length)
8221{
8222 PyObject *result;
8223 Py_UNICODE *p; /* write pointer into result */
8224 Py_ssize_t i;
8225 /* Copy to a new string */
8226 result = (PyObject *)_PyUnicode_New(length);
8227 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8228 if (result == NULL)
8229 return result;
8230 p = PyUnicode_AS_UNICODE(result);
8231 /* Iterate over code points */
8232 for (i = 0; i < length; i++) {
8233 Py_UNICODE ch =s[i];
8234 if (ch > 127) {
8235 int decimal = Py_UNICODE_TODECIMAL(ch);
8236 if (decimal >= 0)
8237 p[i] = '0' + decimal;
8238 }
8239 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008240#ifndef DONT_MAKE_RESULT_READY
8241 if (_PyUnicode_READY_REPLACE(&result)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008242 Py_DECREF(result);
8243 return NULL;
8244 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008245#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02008246 assert(_PyUnicode_CheckConsistency(result, 1));
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008247 return result;
8248}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008249/* --- Decimal Encoder ---------------------------------------------------- */
8250
Alexander Belopolsky40018472011-02-26 01:02:56 +00008251int
8252PyUnicode_EncodeDecimal(Py_UNICODE *s,
8253 Py_ssize_t length,
8254 char *output,
8255 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008256{
8257 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008258 PyObject *errorHandler = NULL;
8259 PyObject *exc = NULL;
8260 const char *encoding = "decimal";
8261 const char *reason = "invalid decimal Unicode string";
8262 /* the following variable is used for caching string comparisons
8263 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8264 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008265
8266 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008267 PyErr_BadArgument();
8268 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008269 }
8270
8271 p = s;
8272 end = s + length;
8273 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008274 register Py_UNICODE ch = *p;
8275 int decimal;
8276 PyObject *repunicode;
8277 Py_ssize_t repsize;
8278 Py_ssize_t newpos;
8279 Py_UNICODE *uni2;
8280 Py_UNICODE *collstart;
8281 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00008282
Benjamin Peterson29060642009-01-31 22:14:21 +00008283 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008284 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008285 ++p;
8286 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008287 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008288 decimal = Py_UNICODE_TODECIMAL(ch);
8289 if (decimal >= 0) {
8290 *output++ = '0' + decimal;
8291 ++p;
8292 continue;
8293 }
8294 if (0 < ch && ch < 256) {
8295 *output++ = (char)ch;
8296 ++p;
8297 continue;
8298 }
8299 /* All other characters are considered unencodable */
8300 collstart = p;
8301 collend = p+1;
8302 while (collend < end) {
8303 if ((0 < *collend && *collend < 256) ||
8304 !Py_UNICODE_ISSPACE(*collend) ||
8305 Py_UNICODE_TODECIMAL(*collend))
8306 break;
8307 }
8308 /* cache callback name lookup
8309 * (if not done yet, i.e. it's the first error) */
8310 if (known_errorHandler==-1) {
8311 if ((errors==NULL) || (!strcmp(errors, "strict")))
8312 known_errorHandler = 1;
8313 else if (!strcmp(errors, "replace"))
8314 known_errorHandler = 2;
8315 else if (!strcmp(errors, "ignore"))
8316 known_errorHandler = 3;
8317 else if (!strcmp(errors, "xmlcharrefreplace"))
8318 known_errorHandler = 4;
8319 else
8320 known_errorHandler = 0;
8321 }
8322 switch (known_errorHandler) {
8323 case 1: /* strict */
8324 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8325 goto onError;
8326 case 2: /* replace */
8327 for (p = collstart; p < collend; ++p)
8328 *output++ = '?';
8329 /* fall through */
8330 case 3: /* ignore */
8331 p = collend;
8332 break;
8333 case 4: /* xmlcharrefreplace */
8334 /* generate replacement (temporarily (mis)uses p) */
8335 for (p = collstart; p < collend; ++p)
8336 output += sprintf(output, "&#%d;", (int)*p);
8337 p = collend;
8338 break;
8339 default:
8340 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8341 encoding, reason, s, length, &exc,
8342 collstart-s, collend-s, &newpos);
8343 if (repunicode == NULL)
8344 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008345 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008346 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008347 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8348 Py_DECREF(repunicode);
8349 goto onError;
8350 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008351 /* generate replacement */
8352 repsize = PyUnicode_GET_SIZE(repunicode);
8353 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8354 Py_UNICODE ch = *uni2;
8355 if (Py_UNICODE_ISSPACE(ch))
8356 *output++ = ' ';
8357 else {
8358 decimal = Py_UNICODE_TODECIMAL(ch);
8359 if (decimal >= 0)
8360 *output++ = '0' + decimal;
8361 else if (0 < ch && ch < 256)
8362 *output++ = (char)ch;
8363 else {
8364 Py_DECREF(repunicode);
8365 raise_encode_exception(&exc, encoding,
8366 s, length, collstart-s, collend-s, reason);
8367 goto onError;
8368 }
8369 }
8370 }
8371 p = s + newpos;
8372 Py_DECREF(repunicode);
8373 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008374 }
8375 /* 0-terminate the output string */
8376 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008377 Py_XDECREF(exc);
8378 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008379 return 0;
8380
Benjamin Peterson29060642009-01-31 22:14:21 +00008381 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008382 Py_XDECREF(exc);
8383 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008384 return -1;
8385}
8386
Guido van Rossumd57fd912000-03-10 22:53:23 +00008387/* --- Helpers ------------------------------------------------------------ */
8388
Victor Stinnerc3cec782011-10-05 21:24:08 +02008389#include "stringlib/asciilib.h"
8390#include "stringlib/fastsearch.h"
8391#include "stringlib/partition.h"
8392#include "stringlib/split.h"
8393#include "stringlib/count.h"
8394#include "stringlib/find.h"
8395#include "stringlib/localeutil.h"
8396#include "stringlib/undef.h"
8397
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008398#include "stringlib/ucs1lib.h"
8399#include "stringlib/fastsearch.h"
8400#include "stringlib/partition.h"
8401#include "stringlib/split.h"
8402#include "stringlib/count.h"
8403#include "stringlib/find.h"
8404#include "stringlib/localeutil.h"
8405#include "stringlib/undef.h"
8406
8407#include "stringlib/ucs2lib.h"
8408#include "stringlib/fastsearch.h"
8409#include "stringlib/partition.h"
8410#include "stringlib/split.h"
8411#include "stringlib/count.h"
8412#include "stringlib/find.h"
8413#include "stringlib/localeutil.h"
8414#include "stringlib/undef.h"
8415
8416#include "stringlib/ucs4lib.h"
8417#include "stringlib/fastsearch.h"
8418#include "stringlib/partition.h"
8419#include "stringlib/split.h"
8420#include "stringlib/count.h"
8421#include "stringlib/find.h"
8422#include "stringlib/localeutil.h"
8423#include "stringlib/undef.h"
8424
8425static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02008426any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ascii)(const Py_UCS1*, Py_ssize_t,
8427 const Py_UCS1*, Py_ssize_t,
8428 Py_ssize_t, Py_ssize_t),
8429 Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008430 const Py_UCS1*, Py_ssize_t,
8431 Py_ssize_t, Py_ssize_t),
8432 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
8433 const Py_UCS2*, Py_ssize_t,
8434 Py_ssize_t, Py_ssize_t),
8435 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
8436 const Py_UCS4*, Py_ssize_t,
8437 Py_ssize_t, Py_ssize_t),
8438 PyObject* s1, PyObject* s2,
8439 Py_ssize_t start,
8440 Py_ssize_t end)
8441{
8442 int kind1, kind2, kind;
8443 void *buf1, *buf2;
8444 Py_ssize_t len1, len2, result;
8445
8446 kind1 = PyUnicode_KIND(s1);
8447 kind2 = PyUnicode_KIND(s2);
8448 kind = kind1 > kind2 ? kind1 : kind2;
8449 buf1 = PyUnicode_DATA(s1);
8450 buf2 = PyUnicode_DATA(s2);
8451 if (kind1 != kind)
8452 buf1 = _PyUnicode_AsKind(s1, kind);
8453 if (!buf1)
8454 return -2;
8455 if (kind2 != kind)
8456 buf2 = _PyUnicode_AsKind(s2, kind);
8457 if (!buf2) {
8458 if (kind1 != kind) PyMem_Free(buf1);
8459 return -2;
8460 }
8461 len1 = PyUnicode_GET_LENGTH(s1);
8462 len2 = PyUnicode_GET_LENGTH(s2);
8463
8464 switch(kind) {
8465 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008466 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8467 result = ascii(buf1, len1, buf2, len2, start, end);
8468 else
8469 result = ucs1(buf1, len1, buf2, len2, start, end);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008470 break;
8471 case PyUnicode_2BYTE_KIND:
8472 result = ucs2(buf1, len1, buf2, len2, start, end);
8473 break;
8474 case PyUnicode_4BYTE_KIND:
8475 result = ucs4(buf1, len1, buf2, len2, start, end);
8476 break;
8477 default:
8478 assert(0); result = -2;
8479 }
8480
8481 if (kind1 != kind)
8482 PyMem_Free(buf1);
8483 if (kind2 != kind)
8484 PyMem_Free(buf2);
8485
8486 return result;
8487}
8488
8489Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02008490_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008491 Py_ssize_t n_buffer,
8492 void *digits, Py_ssize_t n_digits,
8493 Py_ssize_t min_width,
8494 const char *grouping,
8495 const char *thousands_sep)
8496{
8497 switch(kind) {
8498 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008499 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
8500 return _PyUnicode_ascii_InsertThousandsGrouping(
8501 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8502 min_width, grouping, thousands_sep);
8503 else
8504 return _PyUnicode_ucs1_InsertThousandsGrouping(
8505 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8506 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008507 case PyUnicode_2BYTE_KIND:
8508 return _PyUnicode_ucs2_InsertThousandsGrouping(
8509 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8510 min_width, grouping, thousands_sep);
8511 case PyUnicode_4BYTE_KIND:
8512 return _PyUnicode_ucs4_InsertThousandsGrouping(
8513 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8514 min_width, grouping, thousands_sep);
8515 }
8516 assert(0);
8517 return -1;
8518}
8519
8520
Eric Smith8c663262007-08-25 02:26:07 +00008521#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00008522#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008523
Thomas Wouters477c8d52006-05-27 19:21:47 +00008524#include "stringlib/count.h"
8525#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00008526
/* Helper macro that normalizes a start/end slice pair against a length,
   modifying `start` and `end` in place:
     - `end` larger than `len` is clamped to `len`;
     - a negative `end` or `start` is taken relative to `len` and
       clamped to 0 if it is still negative.
   `start` is never clamped against `len`.  The macro expands to plain
   statements and evaluates its arguments more than once, so pass
   side-effect-free expressions only. */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len) {                            \
        end = len;                              \
    }                                           \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0) {                          \
            end = 0;                            \
        }                                       \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0) {                        \
            start = 0;                          \
        }                                       \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008541
Alexander Belopolsky40018472011-02-26 01:02:56 +00008542Py_ssize_t
8543PyUnicode_Count(PyObject *str,
8544 PyObject *substr,
8545 Py_ssize_t start,
8546 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008547{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008548 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008549 PyUnicodeObject* str_obj;
8550 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008551 int kind1, kind2, kind;
8552 void *buf1 = NULL, *buf2 = NULL;
8553 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008554
Thomas Wouters477c8d52006-05-27 19:21:47 +00008555 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008556 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008557 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008558 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008559 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008560 Py_DECREF(str_obj);
8561 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008562 }
Tim Petersced69f82003-09-16 20:30:58 +00008563
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008564 kind1 = PyUnicode_KIND(str_obj);
8565 kind2 = PyUnicode_KIND(sub_obj);
8566 kind = kind1 > kind2 ? kind1 : kind2;
8567 buf1 = PyUnicode_DATA(str_obj);
8568 if (kind1 != kind)
8569 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8570 if (!buf1)
8571 goto onError;
8572 buf2 = PyUnicode_DATA(sub_obj);
8573 if (kind2 != kind)
8574 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8575 if (!buf2)
8576 goto onError;
8577 len1 = PyUnicode_GET_LENGTH(str_obj);
8578 len2 = PyUnicode_GET_LENGTH(sub_obj);
8579
8580 ADJUST_INDICES(start, end, len1);
8581 switch(kind) {
8582 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008583 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8584 result = asciilib_count(
8585 ((Py_UCS1*)buf1) + start, end - start,
8586 buf2, len2, PY_SSIZE_T_MAX
8587 );
8588 else
8589 result = ucs1lib_count(
8590 ((Py_UCS1*)buf1) + start, end - start,
8591 buf2, len2, PY_SSIZE_T_MAX
8592 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008593 break;
8594 case PyUnicode_2BYTE_KIND:
8595 result = ucs2lib_count(
8596 ((Py_UCS2*)buf1) + start, end - start,
8597 buf2, len2, PY_SSIZE_T_MAX
8598 );
8599 break;
8600 case PyUnicode_4BYTE_KIND:
8601 result = ucs4lib_count(
8602 ((Py_UCS4*)buf1) + start, end - start,
8603 buf2, len2, PY_SSIZE_T_MAX
8604 );
8605 break;
8606 default:
8607 assert(0); result = 0;
8608 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008609
8610 Py_DECREF(sub_obj);
8611 Py_DECREF(str_obj);
8612
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008613 if (kind1 != kind)
8614 PyMem_Free(buf1);
8615 if (kind2 != kind)
8616 PyMem_Free(buf2);
8617
Guido van Rossumd57fd912000-03-10 22:53:23 +00008618 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008619 onError:
8620 Py_DECREF(sub_obj);
8621 Py_DECREF(str_obj);
8622 if (kind1 != kind && buf1)
8623 PyMem_Free(buf1);
8624 if (kind2 != kind && buf2)
8625 PyMem_Free(buf2);
8626 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008627}
8628
Alexander Belopolsky40018472011-02-26 01:02:56 +00008629Py_ssize_t
8630PyUnicode_Find(PyObject *str,
8631 PyObject *sub,
8632 Py_ssize_t start,
8633 Py_ssize_t end,
8634 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008635{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008636 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008637
Guido van Rossumd57fd912000-03-10 22:53:23 +00008638 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008639 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008640 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008641 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008642 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008643 Py_DECREF(str);
8644 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008645 }
Tim Petersced69f82003-09-16 20:30:58 +00008646
Thomas Wouters477c8d52006-05-27 19:21:47 +00008647 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008648 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008649 asciilib_find_slice, ucs1lib_find_slice,
8650 ucs2lib_find_slice, ucs4lib_find_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008651 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008652 );
8653 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008654 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008655 asciilib_find_slice, ucs1lib_rfind_slice,
8656 ucs2lib_rfind_slice, ucs4lib_rfind_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008657 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008658 );
8659
Guido van Rossumd57fd912000-03-10 22:53:23 +00008660 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008661 Py_DECREF(sub);
8662
Guido van Rossumd57fd912000-03-10 22:53:23 +00008663 return result;
8664}
8665
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008666Py_ssize_t
8667PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8668 Py_ssize_t start, Py_ssize_t end,
8669 int direction)
8670{
8671 char *result;
8672 int kind;
8673 if (PyUnicode_READY(str) == -1)
8674 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008675 if (start < 0 || end < 0) {
8676 PyErr_SetString(PyExc_IndexError, "string index out of range");
8677 return -2;
8678 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008679 if (end > PyUnicode_GET_LENGTH(str))
8680 end = PyUnicode_GET_LENGTH(str);
8681 kind = PyUnicode_KIND(str);
8682 result = findchar(PyUnicode_1BYTE_DATA(str)
8683 + PyUnicode_KIND_SIZE(kind, start),
8684 kind,
8685 end-start, ch, direction);
8686 if (!result)
8687 return -1;
8688 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8689}
8690
Alexander Belopolsky40018472011-02-26 01:02:56 +00008691static int
8692tailmatch(PyUnicodeObject *self,
8693 PyUnicodeObject *substring,
8694 Py_ssize_t start,
8695 Py_ssize_t end,
8696 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008697{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008698 int kind_self;
8699 int kind_sub;
8700 void *data_self;
8701 void *data_sub;
8702 Py_ssize_t offset;
8703 Py_ssize_t i;
8704 Py_ssize_t end_sub;
8705
8706 if (PyUnicode_READY(self) == -1 ||
8707 PyUnicode_READY(substring) == -1)
8708 return 0;
8709
8710 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008711 return 1;
8712
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008713 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8714 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008715 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008716 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008717
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008718 kind_self = PyUnicode_KIND(self);
8719 data_self = PyUnicode_DATA(self);
8720 kind_sub = PyUnicode_KIND(substring);
8721 data_sub = PyUnicode_DATA(substring);
8722 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8723
8724 if (direction > 0)
8725 offset = end;
8726 else
8727 offset = start;
8728
8729 if (PyUnicode_READ(kind_self, data_self, offset) ==
8730 PyUnicode_READ(kind_sub, data_sub, 0) &&
8731 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8732 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8733 /* If both are of the same kind, memcmp is sufficient */
8734 if (kind_self == kind_sub) {
8735 return ! memcmp((char *)data_self +
8736 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8737 data_sub,
8738 PyUnicode_GET_LENGTH(substring) *
8739 PyUnicode_CHARACTER_SIZE(substring));
8740 }
8741 /* otherwise we have to compare each character by first accesing it */
8742 else {
8743 /* We do not need to compare 0 and len(substring)-1 because
8744 the if statement above ensured already that they are equal
8745 when we end up here. */
8746 // TODO: honor direction and do a forward or backwards search
8747 for (i = 1; i < end_sub; ++i) {
8748 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8749 PyUnicode_READ(kind_sub, data_sub, i))
8750 return 0;
8751 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008752 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008753 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008754 }
8755
8756 return 0;
8757}
8758
Alexander Belopolsky40018472011-02-26 01:02:56 +00008759Py_ssize_t
8760PyUnicode_Tailmatch(PyObject *str,
8761 PyObject *substr,
8762 Py_ssize_t start,
8763 Py_ssize_t end,
8764 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008765{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008766 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008767
Guido van Rossumd57fd912000-03-10 22:53:23 +00008768 str = PyUnicode_FromObject(str);
8769 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008770 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008771 substr = PyUnicode_FromObject(substr);
8772 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008773 Py_DECREF(str);
8774 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008775 }
Tim Petersced69f82003-09-16 20:30:58 +00008776
Guido van Rossumd57fd912000-03-10 22:53:23 +00008777 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008778 (PyUnicodeObject *)substr,
8779 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008780 Py_DECREF(str);
8781 Py_DECREF(substr);
8782 return result;
8783}
8784
Guido van Rossumd57fd912000-03-10 22:53:23 +00008785/* Apply fixfct filter to the Unicode object self and return a
8786 reference to the modified object */
8787
Alexander Belopolsky40018472011-02-26 01:02:56 +00008788static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02008789fixup(PyObject *self,
8790 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008791{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008792 PyObject *u;
8793 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008794
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008795 if (PyUnicode_READY(self) == -1)
8796 return NULL;
8797 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8798 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8799 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008800 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008801 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008802
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008803 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8804 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008805
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008806 /* fix functions return the new maximum character in a string,
8807 if the kind of the resulting unicode object does not change,
8808 everything is fine. Otherwise we need to change the string kind
8809 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02008810 maxchar_new = fixfct(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008811 if (maxchar_new == 0)
8812 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8813 else if (maxchar_new <= 127)
8814 maxchar_new = 127;
8815 else if (maxchar_new <= 255)
8816 maxchar_new = 255;
8817 else if (maxchar_new <= 65535)
8818 maxchar_new = 65535;
8819 else
8820 maxchar_new = 1114111; /* 0x10ffff */
8821
8822 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008823 /* fixfct should return TRUE if it modified the buffer. If
8824 FALSE, return a reference to the original buffer instead
8825 (to save space, not time) */
8826 Py_INCREF(self);
8827 Py_DECREF(u);
8828 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008829 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008830 else if (maxchar_new == maxchar_old) {
8831 return u;
8832 }
8833 else {
8834 /* In case the maximum character changed, we need to
8835 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008836 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008837 if (v == NULL) {
8838 Py_DECREF(u);
8839 return NULL;
8840 }
8841 if (maxchar_new > maxchar_old) {
8842 /* If the maxchar increased so that the kind changed, not all
8843 characters are representable anymore and we need to fix the
8844 string again. This only happens in very few cases. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02008845 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner9310abb2011-10-05 00:59:23 +02008846 maxchar_old = fixfct(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008847 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8848 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008849 else {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02008850 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008851 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008852
8853 Py_DECREF(u);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02008854 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008855 return v;
8856 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008857}
8858
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008859static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008860fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008861{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008862 /* No need to call PyUnicode_READY(self) because this function is only
8863 called as a callback from fixup() which does it already. */
8864 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8865 const int kind = PyUnicode_KIND(self);
8866 void *data = PyUnicode_DATA(self);
8867 int touched = 0;
8868 Py_UCS4 maxchar = 0;
8869 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008870
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008871 for (i = 0; i < len; ++i) {
8872 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8873 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8874 if (up != ch) {
8875 if (up > maxchar)
8876 maxchar = up;
8877 PyUnicode_WRITE(kind, data, i, up);
8878 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008879 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008880 else if (ch > maxchar)
8881 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008882 }
8883
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008884 if (touched)
8885 return maxchar;
8886 else
8887 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008888}
8889
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008890static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008891fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008892{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008893 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8894 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8895 const int kind = PyUnicode_KIND(self);
8896 void *data = PyUnicode_DATA(self);
8897 int touched = 0;
8898 Py_UCS4 maxchar = 0;
8899 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008900
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008901 for(i = 0; i < len; ++i) {
8902 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8903 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8904 if (lo != ch) {
8905 if (lo > maxchar)
8906 maxchar = lo;
8907 PyUnicode_WRITE(kind, data, i, lo);
8908 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008909 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008910 else if (ch > maxchar)
8911 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008912 }
8913
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008914 if (touched)
8915 return maxchar;
8916 else
8917 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008918}
8919
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008920static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008921fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008922{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008923 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8924 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8925 const int kind = PyUnicode_KIND(self);
8926 void *data = PyUnicode_DATA(self);
8927 int touched = 0;
8928 Py_UCS4 maxchar = 0;
8929 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008930
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008931 for(i = 0; i < len; ++i) {
8932 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8933 Py_UCS4 nu = 0;
8934
8935 if (Py_UNICODE_ISUPPER(ch))
8936 nu = Py_UNICODE_TOLOWER(ch);
8937 else if (Py_UNICODE_ISLOWER(ch))
8938 nu = Py_UNICODE_TOUPPER(ch);
8939
8940 if (nu != 0) {
8941 if (nu > maxchar)
8942 maxchar = nu;
8943 PyUnicode_WRITE(kind, data, i, nu);
8944 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008945 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008946 else if (ch > maxchar)
8947 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008948 }
8949
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008950 if (touched)
8951 return maxchar;
8952 else
8953 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008954}
8955
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008956static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008957fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008958{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008959 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8960 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8961 const int kind = PyUnicode_KIND(self);
8962 void *data = PyUnicode_DATA(self);
8963 int touched = 0;
8964 Py_UCS4 maxchar = 0;
8965 Py_ssize_t i = 0;
8966 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008967
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008968 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008969 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008970
8971 ch = PyUnicode_READ(kind, data, i);
8972 if (!Py_UNICODE_ISUPPER(ch)) {
8973 maxchar = Py_UNICODE_TOUPPER(ch);
8974 PyUnicode_WRITE(kind, data, i, maxchar);
8975 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008976 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008977 ++i;
8978 for(; i < len; ++i) {
8979 ch = PyUnicode_READ(kind, data, i);
8980 if (!Py_UNICODE_ISLOWER(ch)) {
8981 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8982 if (lo > maxchar)
8983 maxchar = lo;
8984 PyUnicode_WRITE(kind, data, i, lo);
8985 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008986 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008987 else if (ch > maxchar)
8988 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008989 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008990
8991 if (touched)
8992 return maxchar;
8993 else
8994 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008995}
8996
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008997static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008998fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008999{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009000 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9001 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9002 const int kind = PyUnicode_KIND(self);
9003 void *data = PyUnicode_DATA(self);
9004 Py_UCS4 maxchar = 0;
9005 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009006 int previous_is_cased;
9007
9008 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009009 if (len == 1) {
9010 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9011 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9012 if (ti != ch) {
9013 PyUnicode_WRITE(kind, data, i, ti);
9014 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00009015 }
9016 else
9017 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009018 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009019 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009020 for(; i < len; ++i) {
9021 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9022 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00009023
Benjamin Peterson29060642009-01-31 22:14:21 +00009024 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009025 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00009026 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009027 nu = Py_UNICODE_TOTITLE(ch);
9028
9029 if (nu > maxchar)
9030 maxchar = nu;
9031 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00009032
Benjamin Peterson29060642009-01-31 22:14:21 +00009033 if (Py_UNICODE_ISLOWER(ch) ||
9034 Py_UNICODE_ISUPPER(ch) ||
9035 Py_UNICODE_ISTITLE(ch))
9036 previous_is_cased = 1;
9037 else
9038 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009039 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009040 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009041}
9042
Tim Peters8ce9f162004-08-27 01:49:32 +00009043PyObject *
9044PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009045{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009046 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009047 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009048 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009049 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009050 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9051 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009052 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009053 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009054 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009055 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009056
Tim Peters05eba1f2004-08-27 21:32:02 +00009057 fseq = PySequence_Fast(seq, "");
9058 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009059 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009060 }
9061
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009062 /* NOTE: the following code can't call back into Python code,
9063 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009064 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009065
Tim Peters05eba1f2004-08-27 21:32:02 +00009066 seqlen = PySequence_Fast_GET_SIZE(fseq);
9067 /* If empty sequence, return u"". */
9068 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009069 Py_DECREF(fseq);
9070 Py_INCREF(unicode_empty);
9071 res = unicode_empty;
9072 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009073 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009074
Tim Peters05eba1f2004-08-27 21:32:02 +00009075 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009076 items = PySequence_Fast_ITEMS(fseq);
9077 if (seqlen == 1 && PyUnicode_CheckExact(items[0])) {
9078 res = items[0];
9079 Py_INCREF(res);
9080 Py_DECREF(fseq);
9081 return res;
9082 }
9083
9084 /* Set up sep and seplen */
9085 if (separator == NULL) {
9086 /* fall back to a blank space separator */
9087 sep = PyUnicode_FromOrdinal(' ');
9088 if (!sep)
9089 goto onError;
9090 maxchar = 32;
Tim Peters8ce9f162004-08-27 01:49:32 +00009091 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009092 else {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009093 if (!PyUnicode_Check(separator)) {
9094 PyErr_Format(PyExc_TypeError,
9095 "separator: expected str instance,"
9096 " %.80s found",
9097 Py_TYPE(separator)->tp_name);
9098 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00009099 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009100 if (PyUnicode_READY(separator))
9101 goto onError;
9102 sep = separator;
9103 seplen = PyUnicode_GET_LENGTH(separator);
9104 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9105 /* inc refcount to keep this code path symmetric with the
9106 above case of a blank separator */
9107 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00009108 }
9109
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009110 /* There are at least two things to join, or else we have a subclass
9111 * of str in the sequence.
9112 * Do a pre-pass to figure out the total amount of space we'll
9113 * need (sz), and see whether all argument are strings.
9114 */
9115 sz = 0;
9116 for (i = 0; i < seqlen; i++) {
9117 const Py_ssize_t old_sz = sz;
9118 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009119 if (!PyUnicode_Check(item)) {
9120 PyErr_Format(PyExc_TypeError,
9121 "sequence item %zd: expected str instance,"
9122 " %.80s found",
9123 i, Py_TYPE(item)->tp_name);
9124 goto onError;
9125 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009126 if (PyUnicode_READY(item) == -1)
9127 goto onError;
9128 sz += PyUnicode_GET_LENGTH(item);
9129 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
9130 if (item_maxchar > maxchar)
9131 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009132 if (i != 0)
9133 sz += seplen;
9134 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9135 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009136 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009137 goto onError;
9138 }
9139 }
Tim Petersced69f82003-09-16 20:30:58 +00009140
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009141 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009142 if (res == NULL)
9143 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009144
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009145 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009146 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009147 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009148 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009149 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009150 if (i && seplen != 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009151 copy_characters(res, res_offset, sep, 0, seplen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009152 res_offset += seplen;
Benjamin Peterson29060642009-01-31 22:14:21 +00009153 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009154 itemlen = PyUnicode_GET_LENGTH(item);
9155 if (itemlen != 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009156 copy_characters(res, res_offset, item, 0, itemlen);
Victor Stinner9ce5a832011-10-03 23:36:02 +02009157 res_offset += itemlen;
Victor Stinner9ce5a832011-10-03 23:36:02 +02009158 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009159 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009160 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009161
Tim Peters05eba1f2004-08-27 21:32:02 +00009162 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009163 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009164 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009165 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009166
Benjamin Peterson29060642009-01-31 22:14:21 +00009167 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009168 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009169 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009170 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009171 return NULL;
9172}
9173
/* Store `value` into `length` consecutive characters of the buffer `data`,
   beginning at character index `start`.  `kind` selects the character
   width; it must not be PyUnicode_WCHAR_KIND.  Anything other than the
   1-byte and 2-byte kinds is handled as Py_UCS4. */
#define FILL(kind, data, value, start, length)                          \
    do {                                                                \
        assert((kind) != PyUnicode_WCHAR_KIND);                         \
        switch ((kind)) {                                               \
        case PyUnicode_1BYTE_KIND: {                                    \
            unsigned char *dst_ = (unsigned char *)(data) + (start);    \
            memset(dst_, (unsigned char)(value), (length));             \
            break;                                                      \
        }                                                               \
        case PyUnicode_2BYTE_KIND: {                                    \
            Py_UCS2 *dst_ = (Py_UCS2 *)(data) + (start);                \
            Py_ssize_t n_;                                              \
            for (n_ = 0; n_ < (length); n_++)                           \
                dst_[n_] = (value);                                     \
            break;                                                      \
        }                                                               \
        default: {                                                      \
            Py_UCS4 *dst_ = (Py_UCS4 *)(data) + (start);                \
            Py_ssize_t n_;                                              \
            for (n_ = 0; n_ < (length); n_++)                           \
                dst_[n_] = (value);                                     \
            break;                                                      \
        }                                                               \
        }                                                               \
    } while (0)
9196
Victor Stinner9310abb2011-10-05 00:59:23 +02009197static PyObject *
9198pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009199 Py_ssize_t left,
9200 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009201 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009202{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009203 PyObject *u;
9204 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009205 int kind;
9206 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009207
9208 if (left < 0)
9209 left = 0;
9210 if (right < 0)
9211 right = 0;
9212
Tim Peters7a29bd52001-09-12 03:03:31 +00009213 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009214 Py_INCREF(self);
9215 return self;
9216 }
9217
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009218 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9219 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009220 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9221 return NULL;
9222 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009223 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9224 if (fill > maxchar)
9225 maxchar = fill;
9226 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009227 if (!u)
9228 return NULL;
9229
9230 kind = PyUnicode_KIND(u);
9231 data = PyUnicode_DATA(u);
9232 if (left)
9233 FILL(kind, data, fill, 0, left);
9234 if (right)
9235 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009236 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009237 assert(_PyUnicode_CheckConsistency(u, 1));
9238 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009239}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009240#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009241
Alexander Belopolsky40018472011-02-26 01:02:56 +00009242PyObject *
9243PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009244{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009245 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009246
9247 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009248 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009249 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009250
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009251 switch(PyUnicode_KIND(string)) {
9252 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009253 if (PyUnicode_IS_ASCII(string))
9254 list = asciilib_splitlines(
9255 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9256 PyUnicode_GET_LENGTH(string), keepends);
9257 else
9258 list = ucs1lib_splitlines(
9259 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9260 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009261 break;
9262 case PyUnicode_2BYTE_KIND:
9263 list = ucs2lib_splitlines(
9264 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
9265 PyUnicode_GET_LENGTH(string), keepends);
9266 break;
9267 case PyUnicode_4BYTE_KIND:
9268 list = ucs4lib_splitlines(
9269 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
9270 PyUnicode_GET_LENGTH(string), keepends);
9271 break;
9272 default:
9273 assert(0);
9274 list = 0;
9275 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009276 Py_DECREF(string);
9277 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009278}
9279
Alexander Belopolsky40018472011-02-26 01:02:56 +00009280static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009281split(PyObject *self,
9282 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009283 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009284{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009285 int kind1, kind2, kind;
9286 void *buf1, *buf2;
9287 Py_ssize_t len1, len2;
9288 PyObject* out;
9289
Guido van Rossumd57fd912000-03-10 22:53:23 +00009290 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009291 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009292
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009293 if (PyUnicode_READY(self) == -1)
9294 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009295
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009296 if (substring == NULL)
9297 switch(PyUnicode_KIND(self)) {
9298 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009299 if (PyUnicode_IS_ASCII(self))
9300 return asciilib_split_whitespace(
9301 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9302 PyUnicode_GET_LENGTH(self), maxcount
9303 );
9304 else
9305 return ucs1lib_split_whitespace(
9306 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9307 PyUnicode_GET_LENGTH(self), maxcount
9308 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009309 case PyUnicode_2BYTE_KIND:
9310 return ucs2lib_split_whitespace(
9311 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9312 PyUnicode_GET_LENGTH(self), maxcount
9313 );
9314 case PyUnicode_4BYTE_KIND:
9315 return ucs4lib_split_whitespace(
9316 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9317 PyUnicode_GET_LENGTH(self), maxcount
9318 );
9319 default:
9320 assert(0);
9321 return NULL;
9322 }
9323
9324 if (PyUnicode_READY(substring) == -1)
9325 return NULL;
9326
9327 kind1 = PyUnicode_KIND(self);
9328 kind2 = PyUnicode_KIND(substring);
9329 kind = kind1 > kind2 ? kind1 : kind2;
9330 buf1 = PyUnicode_DATA(self);
9331 buf2 = PyUnicode_DATA(substring);
9332 if (kind1 != kind)
9333 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9334 if (!buf1)
9335 return NULL;
9336 if (kind2 != kind)
9337 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9338 if (!buf2) {
9339 if (kind1 != kind) PyMem_Free(buf1);
9340 return NULL;
9341 }
9342 len1 = PyUnicode_GET_LENGTH(self);
9343 len2 = PyUnicode_GET_LENGTH(substring);
9344
9345 switch(kind) {
9346 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009347 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9348 out = asciilib_split(
9349 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9350 else
9351 out = ucs1lib_split(
9352 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009353 break;
9354 case PyUnicode_2BYTE_KIND:
9355 out = ucs2lib_split(
9356 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9357 break;
9358 case PyUnicode_4BYTE_KIND:
9359 out = ucs4lib_split(
9360 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9361 break;
9362 default:
9363 out = NULL;
9364 }
9365 if (kind1 != kind)
9366 PyMem_Free(buf1);
9367 if (kind2 != kind)
9368 PyMem_Free(buf2);
9369 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009370}
9371
Alexander Belopolsky40018472011-02-26 01:02:56 +00009372static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009373rsplit(PyObject *self,
9374 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009375 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009376{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009377 int kind1, kind2, kind;
9378 void *buf1, *buf2;
9379 Py_ssize_t len1, len2;
9380 PyObject* out;
9381
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009382 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009383 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009384
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009385 if (PyUnicode_READY(self) == -1)
9386 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009387
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009388 if (substring == NULL)
9389 switch(PyUnicode_KIND(self)) {
9390 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009391 if (PyUnicode_IS_ASCII(self))
9392 return asciilib_rsplit_whitespace(
9393 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9394 PyUnicode_GET_LENGTH(self), maxcount
9395 );
9396 else
9397 return ucs1lib_rsplit_whitespace(
9398 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9399 PyUnicode_GET_LENGTH(self), maxcount
9400 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009401 case PyUnicode_2BYTE_KIND:
9402 return ucs2lib_rsplit_whitespace(
9403 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9404 PyUnicode_GET_LENGTH(self), maxcount
9405 );
9406 case PyUnicode_4BYTE_KIND:
9407 return ucs4lib_rsplit_whitespace(
9408 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9409 PyUnicode_GET_LENGTH(self), maxcount
9410 );
9411 default:
9412 assert(0);
9413 return NULL;
9414 }
9415
9416 if (PyUnicode_READY(substring) == -1)
9417 return NULL;
9418
9419 kind1 = PyUnicode_KIND(self);
9420 kind2 = PyUnicode_KIND(substring);
9421 kind = kind1 > kind2 ? kind1 : kind2;
9422 buf1 = PyUnicode_DATA(self);
9423 buf2 = PyUnicode_DATA(substring);
9424 if (kind1 != kind)
9425 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9426 if (!buf1)
9427 return NULL;
9428 if (kind2 != kind)
9429 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9430 if (!buf2) {
9431 if (kind1 != kind) PyMem_Free(buf1);
9432 return NULL;
9433 }
9434 len1 = PyUnicode_GET_LENGTH(self);
9435 len2 = PyUnicode_GET_LENGTH(substring);
9436
9437 switch(kind) {
9438 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009439 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9440 out = asciilib_rsplit(
9441 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9442 else
9443 out = ucs1lib_rsplit(
9444 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009445 break;
9446 case PyUnicode_2BYTE_KIND:
9447 out = ucs2lib_rsplit(
9448 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9449 break;
9450 case PyUnicode_4BYTE_KIND:
9451 out = ucs4lib_rsplit(
9452 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9453 break;
9454 default:
9455 out = NULL;
9456 }
9457 if (kind1 != kind)
9458 PyMem_Free(buf1);
9459 if (kind2 != kind)
9460 PyMem_Free(buf2);
9461 return out;
9462}
9463
9464static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009465anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9466 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009467{
9468 switch(kind) {
9469 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009470 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9471 return asciilib_find(buf1, len1, buf2, len2, offset);
9472 else
9473 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009474 case PyUnicode_2BYTE_KIND:
9475 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9476 case PyUnicode_4BYTE_KIND:
9477 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9478 }
9479 assert(0);
9480 return -1;
9481}
9482
9483static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009484anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9485 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009486{
9487 switch(kind) {
9488 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009489 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9490 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9491 else
9492 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009493 case PyUnicode_2BYTE_KIND:
9494 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9495 case PyUnicode_4BYTE_KIND:
9496 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9497 }
9498 assert(0);
9499 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009500}
9501
Alexander Belopolsky40018472011-02-26 01:02:56 +00009502static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009503replace(PyObject *self, PyObject *str1,
9504 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009505{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009506 PyObject *u;
9507 char *sbuf = PyUnicode_DATA(self);
9508 char *buf1 = PyUnicode_DATA(str1);
9509 char *buf2 = PyUnicode_DATA(str2);
9510 int srelease = 0, release1 = 0, release2 = 0;
9511 int skind = PyUnicode_KIND(self);
9512 int kind1 = PyUnicode_KIND(str1);
9513 int kind2 = PyUnicode_KIND(str2);
9514 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9515 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9516 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009517
9518 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009519 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009520 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009521 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009522
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009523 if (skind < kind1)
9524 /* substring too wide to be present */
9525 goto nothing;
9526
9527 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00009528 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009529 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009530 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009531 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009532 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009533 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009534 Py_UCS4 u1, u2, maxchar;
9535 int mayshrink, rkind;
9536 u1 = PyUnicode_READ_CHAR(str1, 0);
9537 if (!findchar(sbuf, PyUnicode_KIND(self),
9538 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00009539 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009540 u2 = PyUnicode_READ_CHAR(str2, 0);
9541 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9542 /* Replacing u1 with u2 may cause a maxchar reduction in the
9543 result string. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009544 if (u2 > maxchar) {
9545 maxchar = u2;
9546 mayshrink = 0;
9547 }
Victor Stinnerb9275c12011-10-05 14:01:42 +02009548 else
9549 mayshrink = maxchar > 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009550 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009551 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009552 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009553 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009554 rkind = PyUnicode_KIND(u);
9555 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9556 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009557 if (--maxcount < 0)
9558 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009559 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009560 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009561 if (mayshrink) {
9562 PyObject *tmp = u;
9563 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
9564 PyUnicode_GET_LENGTH(tmp));
9565 Py_DECREF(tmp);
9566 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009567 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009568 int rkind = skind;
9569 char *res;
9570 if (kind1 < rkind) {
9571 /* widen substring */
9572 buf1 = _PyUnicode_AsKind(str1, rkind);
9573 if (!buf1) goto error;
9574 release1 = 1;
9575 }
Victor Stinnerc3cec782011-10-05 21:24:08 +02009576 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009577 if (i < 0)
9578 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009579 if (rkind > kind2) {
9580 /* widen replacement */
9581 buf2 = _PyUnicode_AsKind(str2, rkind);
9582 if (!buf2) goto error;
9583 release2 = 1;
9584 }
9585 else if (rkind < kind2) {
9586 /* widen self and buf1 */
9587 rkind = kind2;
9588 if (release1) PyMem_Free(buf1);
9589 sbuf = _PyUnicode_AsKind(self, rkind);
9590 if (!sbuf) goto error;
9591 srelease = 1;
9592 buf1 = _PyUnicode_AsKind(str1, rkind);
9593 if (!buf1) goto error;
9594 release1 = 1;
9595 }
9596 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
9597 if (!res) {
9598 PyErr_NoMemory();
9599 goto error;
9600 }
9601 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009602 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009603 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9604 buf2,
9605 PyUnicode_KIND_SIZE(rkind, len2));
9606 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009607
9608 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +02009609 i = anylib_find(rkind, self,
9610 sbuf+PyUnicode_KIND_SIZE(rkind, i), slen-i,
9611 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009612 if (i == -1)
9613 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009614 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9615 buf2,
9616 PyUnicode_KIND_SIZE(rkind, len2));
9617 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009618 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009619
9620 u = PyUnicode_FromKindAndData(rkind, res, slen);
9621 PyMem_Free(res);
9622 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009623 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009624 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009625
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009626 Py_ssize_t n, i, j, ires;
9627 Py_ssize_t product, new_size;
9628 int rkind = skind;
9629 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009630
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009631 if (kind1 < rkind) {
9632 buf1 = _PyUnicode_AsKind(str1, rkind);
9633 if (!buf1) goto error;
9634 release1 = 1;
9635 }
Victor Stinnerc3cec782011-10-05 21:24:08 +02009636 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009637 if (n == 0)
9638 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009639 if (kind2 < rkind) {
9640 buf2 = _PyUnicode_AsKind(str2, rkind);
9641 if (!buf2) goto error;
9642 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009643 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009644 else if (kind2 > rkind) {
9645 rkind = kind2;
9646 sbuf = _PyUnicode_AsKind(self, rkind);
9647 if (!sbuf) goto error;
9648 srelease = 1;
9649 if (release1) PyMem_Free(buf1);
9650 buf1 = _PyUnicode_AsKind(str1, rkind);
9651 if (!buf1) goto error;
9652 release1 = 1;
9653 }
9654 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9655 PyUnicode_GET_LENGTH(str1))); */
9656 product = n * (len2-len1);
9657 if ((product / (len2-len1)) != n) {
9658 PyErr_SetString(PyExc_OverflowError,
9659 "replace string is too long");
9660 goto error;
9661 }
9662 new_size = slen + product;
9663 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9664 PyErr_SetString(PyExc_OverflowError,
9665 "replace string is too long");
9666 goto error;
9667 }
9668 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9669 if (!res)
9670 goto error;
9671 ires = i = 0;
9672 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009673 while (n-- > 0) {
9674 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +02009675 j = anylib_find(rkind, self,
9676 sbuf + PyUnicode_KIND_SIZE(rkind, i), slen-i,
9677 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009678 if (j == -1)
9679 break;
9680 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009681 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009682 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9683 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9684 PyUnicode_KIND_SIZE(rkind, j-i));
9685 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009686 }
9687 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009688 if (len2 > 0) {
9689 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9690 buf2,
9691 PyUnicode_KIND_SIZE(rkind, len2));
9692 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009693 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009694 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009695 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009696 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009697 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009698 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9699 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9700 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009701 } else {
9702 /* interleave */
9703 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009704 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9705 buf2,
9706 PyUnicode_KIND_SIZE(rkind, len2));
9707 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009708 if (--n <= 0)
9709 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009710 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9711 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9712 PyUnicode_KIND_SIZE(rkind, 1));
9713 ires++;
9714 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009715 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009716 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9717 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9718 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009719 }
Victor Stinnerf48323e2011-10-05 23:27:08 +02009720 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(str2))
9721 u = unicode_fromascii((unsigned char*)res, new_size);
9722 else
9723 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Martin v. Löwis0b1d3482011-10-01 16:35:40 +02009724 PyMem_Free(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009725 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009726 if (srelease)
9727 PyMem_FREE(sbuf);
9728 if (release1)
9729 PyMem_FREE(buf1);
9730 if (release2)
9731 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009732 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009733 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009734
Benjamin Peterson29060642009-01-31 22:14:21 +00009735 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009736 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009737 if (srelease)
9738 PyMem_FREE(sbuf);
9739 if (release1)
9740 PyMem_FREE(buf1);
9741 if (release2)
9742 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009743 if (PyUnicode_CheckExact(self)) {
9744 Py_INCREF(self);
9745 return (PyObject *) self;
9746 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009747 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009748 error:
9749 if (srelease && sbuf)
9750 PyMem_FREE(sbuf);
9751 if (release1 && buf1)
9752 PyMem_FREE(buf1);
9753 if (release2 && buf2)
9754 PyMem_FREE(buf2);
9755 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009756}
9757
9758/* --- Unicode Object Methods --------------------------------------------- */
9759
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009760PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009761 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009762\n\
9763Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009764characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009765
9766static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +02009767unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009768{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009769 return fixup(self, fixtitle);
9770}
9771
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009772PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009773 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009774\n\
9775Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009776have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009777
9778static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +02009779unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009780{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009781 return fixup(self, fixcapitalize);
9782}
9783
9784#if 0
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009785PyDoc_STRVAR(capwords__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009786 "S.capwords() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009787\n\
9788Apply .capitalize() to all words in S and return the result with\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009789normalized whitespace (all whitespace strings are replaced by ' ').");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009790
9791static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009792unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009793{
9794 PyObject *list;
9795 PyObject *item;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009796 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009797
Guido van Rossumd57fd912000-03-10 22:53:23 +00009798 /* Split into words */
9799 list = split(self, NULL, -1);
9800 if (!list)
9801 return NULL;
9802
9803 /* Capitalize each word */
9804 for (i = 0; i < PyList_GET_SIZE(list); i++) {
9805 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
Benjamin Peterson29060642009-01-31 22:14:21 +00009806 fixcapitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009807 if (item == NULL)
9808 goto onError;
9809 Py_DECREF(PyList_GET_ITEM(list, i));
9810 PyList_SET_ITEM(list, i, item);
9811 }
9812
9813 /* Join the words to form a new string */
9814 item = PyUnicode_Join(NULL, list);
9815
Benjamin Peterson29060642009-01-31 22:14:21 +00009816 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009817 Py_DECREF(list);
9818 return (PyObject *)item;
9819}
9820#endif
9821
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009822/* Argument converter. Coerces to a single unicode character */
9823
9824static int
9825convert_uc(PyObject *obj, void *addr)
9826{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009827 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009828 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009829
Benjamin Peterson14339b62009-01-31 16:36:08 +00009830 uniobj = PyUnicode_FromObject(obj);
9831 if (uniobj == NULL) {
9832 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009833 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009834 return 0;
9835 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009836 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009837 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009838 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009839 Py_DECREF(uniobj);
9840 return 0;
9841 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009842 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009843 Py_DECREF(uniobj);
9844 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009845}
9846
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009847PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009848 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009849\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009850Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009851done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009852
9853static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009854unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009855{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009856 Py_ssize_t marg, left;
9857 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009858 Py_UCS4 fillchar = ' ';
9859
Victor Stinnere9a29352011-10-01 02:14:59 +02009860 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009861 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009862
Victor Stinnere9a29352011-10-01 02:14:59 +02009863 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009864 return NULL;
9865
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009866 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009867 Py_INCREF(self);
9868 return (PyObject*) self;
9869 }
9870
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009871 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009872 left = marg / 2 + (marg & width & 1);
9873
Victor Stinner9310abb2011-10-05 00:59:23 +02009874 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009875}
9876
Marc-André Lemburge5034372000-08-08 08:04:29 +00009877#if 0
9878
9879/* This code should go into some future Unicode collation support
9880 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009881 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009882
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009883/* speedy UTF-16 code point order comparison */
9884/* gleaned from: */
9885/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9886
/* Adjustment table indexed by the top five bits of a 16-bit code unit
   (c >> 11); the value is added to the code unit by the UTF-16 variant
   of unicode_compare.  Only entries 27..31 are non-zero. */
static short utf16Fixup[32] =
{
    /*  0 ..  7 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /*  8 .. 15 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 16 .. 23 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 24 .. 31 */ 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
};
9894
Guido van Rossumd57fd912000-03-10 22:53:23 +00009895static int
9896unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9897{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009898 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009899
Guido van Rossumd57fd912000-03-10 22:53:23 +00009900 Py_UNICODE *s1 = str1->str;
9901 Py_UNICODE *s2 = str2->str;
9902
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009903 len1 = str1->_base._base.length;
9904 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009905
Guido van Rossumd57fd912000-03-10 22:53:23 +00009906 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009907 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009908
9909 c1 = *s1++;
9910 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009911
Benjamin Peterson29060642009-01-31 22:14:21 +00009912 if (c1 > (1<<11) * 26)
9913 c1 += utf16Fixup[c1>>11];
9914 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009915 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009916 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009917
9918 if (c1 != c2)
9919 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009920
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009921 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009922 }
9923
9924 return (len1 < len2) ? -1 : (len1 != len2);
9925}
9926
Marc-André Lemburge5034372000-08-08 08:04:29 +00009927#else
9928
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009929/* This function assumes that str1 and str2 are readied by the caller. */
9930
Marc-André Lemburge5034372000-08-08 08:04:29 +00009931static int
9932unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9933{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009934 int kind1, kind2;
9935 void *data1, *data2;
9936 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009937
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009938 kind1 = PyUnicode_KIND(str1);
9939 kind2 = PyUnicode_KIND(str2);
9940 data1 = PyUnicode_DATA(str1);
9941 data2 = PyUnicode_DATA(str2);
9942 len1 = PyUnicode_GET_LENGTH(str1);
9943 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009944
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009945 for (i = 0; i < len1 && i < len2; ++i) {
9946 Py_UCS4 c1, c2;
9947 c1 = PyUnicode_READ(kind1, data1, i);
9948 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009949
9950 if (c1 != c2)
9951 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009952 }
9953
9954 return (len1 < len2) ? -1 : (len1 != len2);
9955}
9956
9957#endif
9958
Alexander Belopolsky40018472011-02-26 01:02:56 +00009959int
9960PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009961{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009962 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9963 if (PyUnicode_READY(left) == -1 ||
9964 PyUnicode_READY(right) == -1)
9965 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009966 return unicode_compare((PyUnicodeObject *)left,
9967 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009968 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009969 PyErr_Format(PyExc_TypeError,
9970 "Can't compare %.100s and %.100s",
9971 left->ob_type->tp_name,
9972 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009973 return -1;
9974}
9975
Martin v. Löwis5b222132007-06-10 09:51:05 +00009976int
9977PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9978{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009979 Py_ssize_t i;
9980 int kind;
9981 void *data;
9982 Py_UCS4 chr;
9983
Victor Stinner910337b2011-10-03 03:20:16 +02009984 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009985 if (PyUnicode_READY(uni) == -1)
9986 return -1;
9987 kind = PyUnicode_KIND(uni);
9988 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009989 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009990 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9991 if (chr != str[i])
9992 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009993 /* This check keeps Python strings that end in '\0' from comparing equal
9994 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009995 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009996 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009997 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009998 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009999 return 0;
10000}
10001
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010002
/* Map a C truth value to Py_True or Py_False.  The macro does not touch
   the reference count; the user has to Py_INCREF the result. */
#define TEST_COND(cond) ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010005
Alexander Belopolsky40018472011-02-26 01:02:56 +000010006PyObject *
10007PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010008{
10009 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010010
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010011 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10012 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010013 if (PyUnicode_READY(left) == -1 ||
10014 PyUnicode_READY(right) == -1)
10015 return NULL;
10016 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10017 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010018 if (op == Py_EQ) {
10019 Py_INCREF(Py_False);
10020 return Py_False;
10021 }
10022 if (op == Py_NE) {
10023 Py_INCREF(Py_True);
10024 return Py_True;
10025 }
10026 }
10027 if (left == right)
10028 result = 0;
10029 else
10030 result = unicode_compare((PyUnicodeObject *)left,
10031 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010032
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010033 /* Convert the return value to a Boolean */
10034 switch (op) {
10035 case Py_EQ:
10036 v = TEST_COND(result == 0);
10037 break;
10038 case Py_NE:
10039 v = TEST_COND(result != 0);
10040 break;
10041 case Py_LE:
10042 v = TEST_COND(result <= 0);
10043 break;
10044 case Py_GE:
10045 v = TEST_COND(result >= 0);
10046 break;
10047 case Py_LT:
10048 v = TEST_COND(result == -1);
10049 break;
10050 case Py_GT:
10051 v = TEST_COND(result == 1);
10052 break;
10053 default:
10054 PyErr_BadArgument();
10055 return NULL;
10056 }
10057 Py_INCREF(v);
10058 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010059 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010060
Brian Curtindfc80e32011-08-10 20:28:54 -050010061 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010062}
10063
Alexander Belopolsky40018472011-02-26 01:02:56 +000010064int
10065PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010066{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010067 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010068 int kind1, kind2, kind;
10069 void *buf1, *buf2;
10070 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010071 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010072
10073 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010074 sub = PyUnicode_FromObject(element);
10075 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010076 PyErr_Format(PyExc_TypeError,
10077 "'in <string>' requires string as left operand, not %s",
10078 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010079 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010080 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010081 if (PyUnicode_READY(sub) == -1)
10082 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010083
Thomas Wouters477c8d52006-05-27 19:21:47 +000010084 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010085 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010086 Py_DECREF(sub);
10087 return -1;
10088 }
10089
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010090 kind1 = PyUnicode_KIND(str);
10091 kind2 = PyUnicode_KIND(sub);
10092 kind = kind1 > kind2 ? kind1 : kind2;
10093 buf1 = PyUnicode_DATA(str);
10094 buf2 = PyUnicode_DATA(sub);
10095 if (kind1 != kind)
10096 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
10097 if (!buf1) {
10098 Py_DECREF(sub);
10099 return -1;
10100 }
10101 if (kind2 != kind)
10102 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
10103 if (!buf2) {
10104 Py_DECREF(sub);
10105 if (kind1 != kind) PyMem_Free(buf1);
10106 return -1;
10107 }
10108 len1 = PyUnicode_GET_LENGTH(str);
10109 len2 = PyUnicode_GET_LENGTH(sub);
10110
10111 switch(kind) {
10112 case PyUnicode_1BYTE_KIND:
10113 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10114 break;
10115 case PyUnicode_2BYTE_KIND:
10116 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10117 break;
10118 case PyUnicode_4BYTE_KIND:
10119 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10120 break;
10121 default:
10122 result = -1;
10123 assert(0);
10124 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010125
10126 Py_DECREF(str);
10127 Py_DECREF(sub);
10128
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010129 if (kind1 != kind)
10130 PyMem_Free(buf1);
10131 if (kind2 != kind)
10132 PyMem_Free(buf2);
10133
Guido van Rossum403d68b2000-03-13 15:55:09 +000010134 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010135}
10136
Guido van Rossumd57fd912000-03-10 22:53:23 +000010137/* Concat to string or Unicode object giving a new Unicode object. */
10138
Alexander Belopolsky40018472011-02-26 01:02:56 +000010139PyObject *
10140PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010141{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010142 PyObject *u = NULL, *v = NULL, *w;
10143 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010144
10145 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010146 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010147 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010148 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010149 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010150 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010151 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010152
10153 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010154 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010155 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010156 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010157 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010158 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010159 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010160 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010161 }
10162
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010163 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +020010164 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010165
Guido van Rossumd57fd912000-03-10 22:53:23 +000010166 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010167 w = PyUnicode_New(
10168 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10169 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010170 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010171 goto onError;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010172 copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
10173 copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010174 Py_DECREF(u);
10175 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010176 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010177 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010178
Benjamin Peterson29060642009-01-31 22:14:21 +000010179 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010180 Py_XDECREF(u);
10181 Py_XDECREF(v);
10182 return NULL;
10183}
10184
Victor Stinnerb0923652011-10-04 01:17:31 +020010185static void
10186unicode_append_inplace(PyObject **p_left, PyObject *right)
10187{
10188 Py_ssize_t left_len, right_len, new_len;
Victor Stinnerb0923652011-10-04 01:17:31 +020010189
10190 assert(PyUnicode_IS_READY(*p_left));
10191 assert(PyUnicode_IS_READY(right));
10192
10193 left_len = PyUnicode_GET_LENGTH(*p_left);
10194 right_len = PyUnicode_GET_LENGTH(right);
10195 if (left_len > PY_SSIZE_T_MAX - right_len) {
10196 PyErr_SetString(PyExc_OverflowError,
10197 "strings are too large to concat");
10198 goto error;
10199 }
10200 new_len = left_len + right_len;
10201
10202 /* Now we own the last reference to 'left', so we can resize it
10203 * in-place.
10204 */
10205 if (unicode_resize(p_left, new_len) != 0) {
10206 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10207 * deallocated so it cannot be put back into
10208 * 'variable'. The MemoryError is raised when there
10209 * is no value in 'variable', which might (very
10210 * remotely) be a cause of incompatibilities.
10211 */
10212 goto error;
10213 }
10214 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010215 copy_characters(*p_left, left_len, right, 0, right_len);
10216 _PyUnicode_DIRTY(*p_left);
Victor Stinnerb0923652011-10-04 01:17:31 +020010217 return;
10218
10219error:
10220 Py_DECREF(*p_left);
10221 *p_left = NULL;
10222}
10223
Walter Dörwald1ab83302007-05-18 17:15:44 +000010224void
Victor Stinner23e56682011-10-03 03:54:37 +020010225PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010226{
Victor Stinner23e56682011-10-03 03:54:37 +020010227 PyObject *left, *res;
10228
10229 if (p_left == NULL) {
10230 if (!PyErr_Occurred())
10231 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010232 return;
10233 }
Victor Stinner23e56682011-10-03 03:54:37 +020010234 left = *p_left;
10235 if (right == NULL || !PyUnicode_Check(left)) {
10236 if (!PyErr_Occurred())
10237 PyErr_BadInternalCall();
10238 goto error;
10239 }
10240
Victor Stinnere1335c72011-10-04 20:53:03 +020010241 if (PyUnicode_READY(left))
10242 goto error;
10243 if (PyUnicode_READY(right))
10244 goto error;
10245
Victor Stinner23e56682011-10-03 03:54:37 +020010246 if (PyUnicode_CheckExact(left) && left != unicode_empty
10247 && PyUnicode_CheckExact(right) && right != unicode_empty
10248 && unicode_resizable(left)
10249 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10250 || _PyUnicode_WSTR(left) != NULL))
10251 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010252 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10253 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010254 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010255 not so different than duplicating the string. */
10256 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010257 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010258 unicode_append_inplace(p_left, right);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010259 if (p_left != NULL)
10260 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010261 return;
10262 }
10263 }
10264
10265 res = PyUnicode_Concat(left, right);
10266 if (res == NULL)
10267 goto error;
10268 Py_DECREF(left);
10269 *p_left = res;
10270 return;
10271
10272error:
10273 Py_DECREF(*p_left);
10274 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010275}
10276
10277void
10278PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10279{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010280 PyUnicode_Append(pleft, right);
10281 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010282}
10283
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010284PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010285 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010286\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010287Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010288string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010289interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010290
10291static PyObject *
10292unicode_count(PyUnicodeObject *self, PyObject *args)
10293{
10294 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010295 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010296 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010297 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010298 int kind1, kind2, kind;
10299 void *buf1, *buf2;
10300 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010301
Jesus Ceaac451502011-04-20 17:09:23 +020010302 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10303 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010304 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010305
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010306 kind1 = PyUnicode_KIND(self);
10307 kind2 = PyUnicode_KIND(substring);
10308 kind = kind1 > kind2 ? kind1 : kind2;
10309 buf1 = PyUnicode_DATA(self);
10310 buf2 = PyUnicode_DATA(substring);
10311 if (kind1 != kind)
10312 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10313 if (!buf1) {
10314 Py_DECREF(substring);
10315 return NULL;
10316 }
10317 if (kind2 != kind)
10318 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10319 if (!buf2) {
10320 Py_DECREF(substring);
10321 if (kind1 != kind) PyMem_Free(buf1);
10322 return NULL;
10323 }
10324 len1 = PyUnicode_GET_LENGTH(self);
10325 len2 = PyUnicode_GET_LENGTH(substring);
10326
10327 ADJUST_INDICES(start, end, len1);
10328 switch(kind) {
10329 case PyUnicode_1BYTE_KIND:
10330 iresult = ucs1lib_count(
10331 ((Py_UCS1*)buf1) + start, end - start,
10332 buf2, len2, PY_SSIZE_T_MAX
10333 );
10334 break;
10335 case PyUnicode_2BYTE_KIND:
10336 iresult = ucs2lib_count(
10337 ((Py_UCS2*)buf1) + start, end - start,
10338 buf2, len2, PY_SSIZE_T_MAX
10339 );
10340 break;
10341 case PyUnicode_4BYTE_KIND:
10342 iresult = ucs4lib_count(
10343 ((Py_UCS4*)buf1) + start, end - start,
10344 buf2, len2, PY_SSIZE_T_MAX
10345 );
10346 break;
10347 default:
10348 assert(0); iresult = 0;
10349 }
10350
10351 result = PyLong_FromSsize_t(iresult);
10352
10353 if (kind1 != kind)
10354 PyMem_Free(buf1);
10355 if (kind2 != kind)
10356 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010357
10358 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010359
Guido van Rossumd57fd912000-03-10 22:53:23 +000010360 return result;
10361}
10362
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010363PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010364 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010365\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010366Encode S using the codec registered for encoding. Default encoding\n\
10367is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010368handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010369a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10370'xmlcharrefreplace' as well as any other name registered with\n\
10371codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010372
10373static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010374unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010375{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010376 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010377 char *encoding = NULL;
10378 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010379
Benjamin Peterson308d6372009-09-18 21:42:35 +000010380 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10381 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010382 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010383 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010384}
10385
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010386PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010387 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010388\n\
10389Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010390If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010391
10392static PyObject*
10393unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10394{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010395 Py_ssize_t i, j, line_pos, src_len, incr;
10396 Py_UCS4 ch;
10397 PyObject *u;
10398 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010399 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010400 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010401 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010402
10403 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010404 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010405
Antoine Pitrou22425222011-10-04 19:10:51 +020010406 if (PyUnicode_READY(self) == -1)
10407 return NULL;
10408
Thomas Wouters7e474022000-07-16 12:04:32 +000010409 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010410 src_len = PyUnicode_GET_LENGTH(self);
10411 i = j = line_pos = 0;
10412 kind = PyUnicode_KIND(self);
10413 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010414 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010415 for (; i < src_len; i++) {
10416 ch = PyUnicode_READ(kind, src_data, i);
10417 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010418 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010419 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010420 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010421 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010422 goto overflow;
10423 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010424 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010425 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010426 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010427 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010428 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010429 goto overflow;
10430 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010431 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010432 if (ch == '\n' || ch == '\r')
10433 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010434 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010435 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020010436 if (!found && PyUnicode_CheckExact(self)) {
10437 Py_INCREF((PyObject *) self);
10438 return (PyObject *) self;
10439 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010440
Guido van Rossumd57fd912000-03-10 22:53:23 +000010441 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010442 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010443 if (!u)
10444 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010445 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010446
Antoine Pitroue71d5742011-10-04 15:55:09 +020010447 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010448
Antoine Pitroue71d5742011-10-04 15:55:09 +020010449 for (; i < src_len; i++) {
10450 ch = PyUnicode_READ(kind, src_data, i);
10451 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010452 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010453 incr = tabsize - (line_pos % tabsize);
10454 line_pos += incr;
10455 while (incr--) {
10456 PyUnicode_WRITE(kind, dest_data, j, ' ');
10457 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010458 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010459 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010460 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010461 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010462 line_pos++;
10463 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010464 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010465 if (ch == '\n' || ch == '\r')
10466 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010467 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010468 }
10469 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinner17efeed2011-10-04 20:05:46 +020010470#ifndef DONT_MAKE_RESULT_READY
10471 if (_PyUnicode_READY_REPLACE(&u)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010472 Py_DECREF(u);
10473 return NULL;
10474 }
Victor Stinner17efeed2011-10-04 20:05:46 +020010475#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010476 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010477 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010478
Antoine Pitroue71d5742011-10-04 15:55:09 +020010479 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010480 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10481 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010482}
10483
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010484PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010485 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010486\n\
10487Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010488such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010489arguments start and end are interpreted as in slice notation.\n\
10490\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010491Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010492
10493static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010494unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010495{
Jesus Ceaac451502011-04-20 17:09:23 +020010496 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010497 Py_ssize_t start;
10498 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010499 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010500
Jesus Ceaac451502011-04-20 17:09:23 +020010501 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10502 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010503 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010504
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010505 if (PyUnicode_READY(self) == -1)
10506 return NULL;
10507 if (PyUnicode_READY(substring) == -1)
10508 return NULL;
10509
10510 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +020010511 asciilib_find_slice, ucs1lib_find_slice,
10512 ucs2lib_find_slice, ucs4lib_find_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010513 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010514 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010515
10516 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010517
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010518 if (result == -2)
10519 return NULL;
10520
Christian Heimes217cfd12007-12-02 14:31:20 +000010521 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010522}
10523
10524static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010525unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010526{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010527 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10528 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010529 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010530 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010531}
10532
Guido van Rossumc2504932007-09-18 19:42:40 +000010533/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010534 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010535static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000010536unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010537{
Guido van Rossumc2504932007-09-18 19:42:40 +000010538 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010539 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010540
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010541 if (_PyUnicode_HASH(self) != -1)
10542 return _PyUnicode_HASH(self);
10543 if (PyUnicode_READY(self) == -1)
10544 return -1;
10545 len = PyUnicode_GET_LENGTH(self);
10546
10547 /* The hash function as a macro, gets expanded three times below. */
10548#define HASH(P) \
10549 x = (Py_uhash_t)*P << 7; \
10550 while (--len >= 0) \
10551 x = (1000003*x) ^ (Py_uhash_t)*P++;
10552
10553 switch (PyUnicode_KIND(self)) {
10554 case PyUnicode_1BYTE_KIND: {
10555 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10556 HASH(c);
10557 break;
10558 }
10559 case PyUnicode_2BYTE_KIND: {
10560 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10561 HASH(s);
10562 break;
10563 }
10564 default: {
10565 Py_UCS4 *l;
10566 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10567 "Impossible switch case in unicode_hash");
10568 l = PyUnicode_4BYTE_DATA(self);
10569 HASH(l);
10570 break;
10571 }
10572 }
10573 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10574
Guido van Rossumc2504932007-09-18 19:42:40 +000010575 if (x == -1)
10576 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010577 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010578 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010579}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010580#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010581
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010582PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010583 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010584\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010585Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010586
10587static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010588unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010589{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010590 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020010591 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010592 Py_ssize_t start;
10593 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010594
Jesus Ceaac451502011-04-20 17:09:23 +020010595 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10596 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010597 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010598
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010599 if (PyUnicode_READY(self) == -1)
10600 return NULL;
10601 if (PyUnicode_READY(substring) == -1)
10602 return NULL;
10603
10604 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +020010605 asciilib_find_slice, ucs1lib_find_slice,
10606 ucs2lib_find_slice, ucs4lib_find_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010607 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010608 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010609
10610 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010611
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010612 if (result == -2)
10613 return NULL;
10614
Guido van Rossumd57fd912000-03-10 22:53:23 +000010615 if (result < 0) {
10616 PyErr_SetString(PyExc_ValueError, "substring not found");
10617 return NULL;
10618 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010619
Christian Heimes217cfd12007-12-02 14:31:20 +000010620 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010621}
10622
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010623PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010624 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010625\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010626Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010627at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010628
10629static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010630unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010631{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010632 Py_ssize_t i, length;
10633 int kind;
10634 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010635 int cased;
10636
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010637 if (PyUnicode_READY(self) == -1)
10638 return NULL;
10639 length = PyUnicode_GET_LENGTH(self);
10640 kind = PyUnicode_KIND(self);
10641 data = PyUnicode_DATA(self);
10642
Guido van Rossumd57fd912000-03-10 22:53:23 +000010643 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010644 if (length == 1)
10645 return PyBool_FromLong(
10646 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010647
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010648 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010649 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010650 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010651
Guido van Rossumd57fd912000-03-10 22:53:23 +000010652 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010653 for (i = 0; i < length; i++) {
10654 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010655
Benjamin Peterson29060642009-01-31 22:14:21 +000010656 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10657 return PyBool_FromLong(0);
10658 else if (!cased && Py_UNICODE_ISLOWER(ch))
10659 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010660 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010661 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010662}
10663
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010664PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010665 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010666\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010667Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010668at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010669
10670static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010671unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010672{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010673 Py_ssize_t i, length;
10674 int kind;
10675 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010676 int cased;
10677
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010678 if (PyUnicode_READY(self) == -1)
10679 return NULL;
10680 length = PyUnicode_GET_LENGTH(self);
10681 kind = PyUnicode_KIND(self);
10682 data = PyUnicode_DATA(self);
10683
Guido van Rossumd57fd912000-03-10 22:53:23 +000010684 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010685 if (length == 1)
10686 return PyBool_FromLong(
10687 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010688
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010689 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010690 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010691 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010692
Guido van Rossumd57fd912000-03-10 22:53:23 +000010693 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010694 for (i = 0; i < length; i++) {
10695 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010696
Benjamin Peterson29060642009-01-31 22:14:21 +000010697 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10698 return PyBool_FromLong(0);
10699 else if (!cased && Py_UNICODE_ISUPPER(ch))
10700 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010701 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010702 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010703}
10704
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010705PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010706 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010707\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010708Return True if S is a titlecased string and there is at least one\n\
10709character in S, i.e. upper- and titlecase characters may only\n\
10710follow uncased characters and lowercase characters only cased ones.\n\
10711Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010712
10713static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010714unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010715{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010716 Py_ssize_t i, length;
10717 int kind;
10718 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010719 int cased, previous_is_cased;
10720
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010721 if (PyUnicode_READY(self) == -1)
10722 return NULL;
10723 length = PyUnicode_GET_LENGTH(self);
10724 kind = PyUnicode_KIND(self);
10725 data = PyUnicode_DATA(self);
10726
Guido van Rossumd57fd912000-03-10 22:53:23 +000010727 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010728 if (length == 1) {
10729 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10730 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10731 (Py_UNICODE_ISUPPER(ch) != 0));
10732 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010733
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010734 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010735 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010736 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010737
Guido van Rossumd57fd912000-03-10 22:53:23 +000010738 cased = 0;
10739 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010740 for (i = 0; i < length; i++) {
10741 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010742
Benjamin Peterson29060642009-01-31 22:14:21 +000010743 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10744 if (previous_is_cased)
10745 return PyBool_FromLong(0);
10746 previous_is_cased = 1;
10747 cased = 1;
10748 }
10749 else if (Py_UNICODE_ISLOWER(ch)) {
10750 if (!previous_is_cased)
10751 return PyBool_FromLong(0);
10752 previous_is_cased = 1;
10753 cased = 1;
10754 }
10755 else
10756 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010757 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010758 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010759}
10760
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010761PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010762 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010763\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010764Return True if all characters in S are whitespace\n\
10765and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010766
10767static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010768unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010769{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010770 Py_ssize_t i, length;
10771 int kind;
10772 void *data;
10773
10774 if (PyUnicode_READY(self) == -1)
10775 return NULL;
10776 length = PyUnicode_GET_LENGTH(self);
10777 kind = PyUnicode_KIND(self);
10778 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010779
Guido van Rossumd57fd912000-03-10 22:53:23 +000010780 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010781 if (length == 1)
10782 return PyBool_FromLong(
10783 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010784
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010785 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010786 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010787 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010788
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010789 for (i = 0; i < length; i++) {
10790 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010791 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010792 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010793 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010794 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010795}
10796
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010797PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010798 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010799\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010800Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010801and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010802
10803static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010804unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010805{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010806 Py_ssize_t i, length;
10807 int kind;
10808 void *data;
10809
10810 if (PyUnicode_READY(self) == -1)
10811 return NULL;
10812 length = PyUnicode_GET_LENGTH(self);
10813 kind = PyUnicode_KIND(self);
10814 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010815
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010816 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010817 if (length == 1)
10818 return PyBool_FromLong(
10819 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010820
10821 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010822 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010823 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010824
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010825 for (i = 0; i < length; i++) {
10826 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010827 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010828 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010829 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010830}
10831
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010832PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010833 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010834\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010835Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010836and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010837
10838static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010839unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010840{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010841 int kind;
10842 void *data;
10843 Py_ssize_t len, i;
10844
10845 if (PyUnicode_READY(self) == -1)
10846 return NULL;
10847
10848 kind = PyUnicode_KIND(self);
10849 data = PyUnicode_DATA(self);
10850 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010851
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010852 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010853 if (len == 1) {
10854 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10855 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10856 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010857
10858 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010859 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010860 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010861
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010862 for (i = 0; i < len; i++) {
10863 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010864 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010865 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010866 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010867 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010868}
10869
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010870PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010871 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010872\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010873Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010874False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010875
10876static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010877unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010878{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010879 Py_ssize_t i, length;
10880 int kind;
10881 void *data;
10882
10883 if (PyUnicode_READY(self) == -1)
10884 return NULL;
10885 length = PyUnicode_GET_LENGTH(self);
10886 kind = PyUnicode_KIND(self);
10887 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010888
Guido van Rossumd57fd912000-03-10 22:53:23 +000010889 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010890 if (length == 1)
10891 return PyBool_FromLong(
10892 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010893
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010894 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010895 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010896 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010897
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010898 for (i = 0; i < length; i++) {
10899 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010900 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010901 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010902 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010903}
10904
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010905PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010906 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010907\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010908Return True if all characters in S are digits\n\
10909and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010910
10911static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010912unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010913{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010914 Py_ssize_t i, length;
10915 int kind;
10916 void *data;
10917
10918 if (PyUnicode_READY(self) == -1)
10919 return NULL;
10920 length = PyUnicode_GET_LENGTH(self);
10921 kind = PyUnicode_KIND(self);
10922 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010923
Guido van Rossumd57fd912000-03-10 22:53:23 +000010924 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010925 if (length == 1) {
10926 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10927 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10928 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010929
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010930 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010931 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010932 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010933
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010934 for (i = 0; i < length; i++) {
10935 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010936 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010937 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010938 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010939}
10940
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010941PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010942 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010943\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010944Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010945False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010946
10947static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010948unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010949{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010950 Py_ssize_t i, length;
10951 int kind;
10952 void *data;
10953
10954 if (PyUnicode_READY(self) == -1)
10955 return NULL;
10956 length = PyUnicode_GET_LENGTH(self);
10957 kind = PyUnicode_KIND(self);
10958 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010959
Guido van Rossumd57fd912000-03-10 22:53:23 +000010960 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010961 if (length == 1)
10962 return PyBool_FromLong(
10963 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010964
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010965 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010966 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010967 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010968
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010969 for (i = 0; i < length; i++) {
10970 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010971 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010972 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010973 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010974}
10975
Martin v. Löwis47383402007-08-15 07:32:56 +000010976int
10977PyUnicode_IsIdentifier(PyObject *self)
10978{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010979 int kind;
10980 void *data;
10981 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010982 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010983
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010984 if (PyUnicode_READY(self) == -1) {
10985 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010986 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010987 }
10988
10989 /* Special case for empty strings */
10990 if (PyUnicode_GET_LENGTH(self) == 0)
10991 return 0;
10992 kind = PyUnicode_KIND(self);
10993 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010994
10995 /* PEP 3131 says that the first character must be in
10996 XID_Start and subsequent characters in XID_Continue,
10997 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010998 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010999 letters, digits, underscore). However, given the current
11000 definition of XID_Start and XID_Continue, it is sufficient
11001 to check just for these, except that _ must be allowed
11002 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011003 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011004 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011005 return 0;
11006
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011007 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011008 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011009 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011010 return 1;
11011}
11012
11013PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011014 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011015\n\
11016Return True if S is a valid identifier according\n\
11017to the language definition.");
11018
11019static PyObject*
11020unicode_isidentifier(PyObject *self)
11021{
11022 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11023}
11024
Georg Brandl559e5d72008-06-11 18:37:52 +000011025PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011026 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011027\n\
11028Return True if all characters in S are considered\n\
11029printable in repr() or S is empty, False otherwise.");
11030
11031static PyObject*
11032unicode_isprintable(PyObject *self)
11033{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011034 Py_ssize_t i, length;
11035 int kind;
11036 void *data;
11037
11038 if (PyUnicode_READY(self) == -1)
11039 return NULL;
11040 length = PyUnicode_GET_LENGTH(self);
11041 kind = PyUnicode_KIND(self);
11042 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011043
11044 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011045 if (length == 1)
11046 return PyBool_FromLong(
11047 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011048
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011049 for (i = 0; i < length; i++) {
11050 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011051 Py_RETURN_FALSE;
11052 }
11053 }
11054 Py_RETURN_TRUE;
11055}
11056
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011057PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011058 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011059\n\
11060Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011061iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011062
11063static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011064unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011065{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011066 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011067}
11068
Martin v. Löwis18e16552006-02-15 17:27:45 +000011069static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000011070unicode_length(PyUnicodeObject *self)
11071{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011072 if (PyUnicode_READY(self) == -1)
11073 return -1;
11074 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011075}
11076
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011077PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011078 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011079\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011080Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011081done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011082
11083static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011084unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011085{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011086 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011087 Py_UCS4 fillchar = ' ';
11088
11089 if (PyUnicode_READY(self) == -1)
11090 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011091
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011092 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011093 return NULL;
11094
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011095 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011096 Py_INCREF(self);
11097 return (PyObject*) self;
11098 }
11099
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011100 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011101}
11102
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011103PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011104 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011105\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011106Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011107
11108static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011109unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011110{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011111 return fixup(self, fixlower);
11112}
11113
/* Selectors for the strip family of functions; they also serve as
   indexes into stripformat[] below. */
#define LEFTSTRIP  0
#define RIGHTSTRIP 1
#define BOTHSTRIP  2

/* PyArg_ParseTuple() format strings, indexed by the selectors above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: the format string with its 3-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
11122
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011123/* externally visible for str.strip(unicode) */
11124PyObject *
11125_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
11126{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011127 void *data;
11128 int kind;
11129 Py_ssize_t i, j, len;
11130 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011131
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011132 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11133 return NULL;
11134
11135 kind = PyUnicode_KIND(self);
11136 data = PyUnicode_DATA(self);
11137 len = PyUnicode_GET_LENGTH(self);
11138 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11139 PyUnicode_DATA(sepobj),
11140 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011141
Benjamin Peterson14339b62009-01-31 16:36:08 +000011142 i = 0;
11143 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011144 while (i < len &&
11145 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011146 i++;
11147 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011148 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011149
Benjamin Peterson14339b62009-01-31 16:36:08 +000011150 j = len;
11151 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011152 do {
11153 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011154 } while (j >= i &&
11155 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011156 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011157 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011158
Victor Stinner12bab6d2011-10-01 01:53:49 +020011159 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011160}
11161
11162PyObject*
11163PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11164{
11165 unsigned char *data;
11166 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011167 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011168
Victor Stinnerde636f32011-10-01 03:55:54 +020011169 if (PyUnicode_READY(self) == -1)
11170 return NULL;
11171
11172 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11173
Victor Stinner12bab6d2011-10-01 01:53:49 +020011174 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011175 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011176 if (PyUnicode_CheckExact(self)) {
11177 Py_INCREF(self);
11178 return self;
11179 }
11180 else
11181 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011182 }
11183
Victor Stinner12bab6d2011-10-01 01:53:49 +020011184 length = end - start;
11185 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011186 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011187
Victor Stinnerde636f32011-10-01 03:55:54 +020011188 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011189 PyErr_SetString(PyExc_IndexError, "string index out of range");
11190 return NULL;
11191 }
11192
Victor Stinnerb9275c12011-10-05 14:01:42 +020011193 if (PyUnicode_IS_ASCII(self)) {
11194 kind = PyUnicode_KIND(self);
11195 data = PyUnicode_1BYTE_DATA(self);
11196 return unicode_fromascii(data + start, length);
11197 }
11198 else {
11199 kind = PyUnicode_KIND(self);
11200 data = PyUnicode_1BYTE_DATA(self);
11201 return PyUnicode_FromKindAndData(kind,
11202 data + PyUnicode_KIND_SIZE(kind, start),
11203 length);
11204 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011205}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011206
11207static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011208do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011209{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011210 int kind;
11211 void *data;
11212 Py_ssize_t len, i, j;
11213
11214 if (PyUnicode_READY(self) == -1)
11215 return NULL;
11216
11217 kind = PyUnicode_KIND(self);
11218 data = PyUnicode_DATA(self);
11219 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011220
Benjamin Peterson14339b62009-01-31 16:36:08 +000011221 i = 0;
11222 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011223 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011224 i++;
11225 }
11226 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011227
Benjamin Peterson14339b62009-01-31 16:36:08 +000011228 j = len;
11229 if (striptype != LEFTSTRIP) {
11230 do {
11231 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011232 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011233 j++;
11234 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011235
Victor Stinner12bab6d2011-10-01 01:53:49 +020011236 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011237}
11238
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011239
11240static PyObject *
11241do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
11242{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011243 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011244
Benjamin Peterson14339b62009-01-31 16:36:08 +000011245 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11246 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011247
Benjamin Peterson14339b62009-01-31 16:36:08 +000011248 if (sep != NULL && sep != Py_None) {
11249 if (PyUnicode_Check(sep))
11250 return _PyUnicode_XStrip(self, striptype, sep);
11251 else {
11252 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011253 "%s arg must be None or str",
11254 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011255 return NULL;
11256 }
11257 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011258
Benjamin Peterson14339b62009-01-31 16:36:08 +000011259 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011260}
11261
11262
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011263PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011264 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011265\n\
11266Return a copy of the string S with leading and trailing\n\
11267whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011268If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011269
11270static PyObject *
11271unicode_strip(PyUnicodeObject *self, PyObject *args)
11272{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011273 if (PyTuple_GET_SIZE(args) == 0)
11274 return do_strip(self, BOTHSTRIP); /* Common case */
11275 else
11276 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011277}
11278
11279
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011280PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011281 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011282\n\
11283Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011284If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011285
11286static PyObject *
11287unicode_lstrip(PyUnicodeObject *self, PyObject *args)
11288{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011289 if (PyTuple_GET_SIZE(args) == 0)
11290 return do_strip(self, LEFTSTRIP); /* Common case */
11291 else
11292 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011293}
11294
11295
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011296PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011297 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011298\n\
11299Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011300If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011301
11302static PyObject *
11303unicode_rstrip(PyUnicodeObject *self, PyObject *args)
11304{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011305 if (PyTuple_GET_SIZE(args) == 0)
11306 return do_strip(self, RIGHTSTRIP); /* Common case */
11307 else
11308 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011309}
11310
11311
Guido van Rossumd57fd912000-03-10 22:53:23 +000011312static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000011313unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011314{
11315 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011316 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011317
Georg Brandl222de0f2009-04-12 12:01:50 +000011318 if (len < 1) {
11319 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011320 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011321 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011322
Tim Peters7a29bd52001-09-12 03:03:31 +000011323 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011324 /* no repeat, return original string */
11325 Py_INCREF(str);
11326 return (PyObject*) str;
11327 }
Tim Peters8f422462000-09-09 06:13:41 +000011328
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011329 if (PyUnicode_READY(str) == -1)
11330 return NULL;
11331
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011332 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011333 PyErr_SetString(PyExc_OverflowError,
11334 "repeated string is too long");
11335 return NULL;
11336 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011337 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011338
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011339 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011340 if (!u)
11341 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011342 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011343
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011344 if (PyUnicode_GET_LENGTH(str) == 1) {
11345 const int kind = PyUnicode_KIND(str);
11346 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11347 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011348 if (kind == PyUnicode_1BYTE_KIND)
11349 memset(to, (unsigned char)fill_char, len);
11350 else {
11351 for (n = 0; n < len; ++n)
11352 PyUnicode_WRITE(kind, to, n, fill_char);
11353 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011354 }
11355 else {
11356 /* number of characters copied this far */
11357 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
11358 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
11359 char *to = (char *) PyUnicode_DATA(u);
11360 Py_MEMCPY(to, PyUnicode_DATA(str),
11361 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011362 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011363 n = (done <= nchars-done) ? done : nchars-done;
11364 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011365 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011366 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011367 }
11368
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011369 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011370 return (PyObject*) u;
11371}
11372
Alexander Belopolsky40018472011-02-26 01:02:56 +000011373PyObject *
11374PyUnicode_Replace(PyObject *obj,
11375 PyObject *subobj,
11376 PyObject *replobj,
11377 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011378{
11379 PyObject *self;
11380 PyObject *str1;
11381 PyObject *str2;
11382 PyObject *result;
11383
11384 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011385 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011386 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011387 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011388 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011389 Py_DECREF(self);
11390 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011391 }
11392 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011393 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011394 Py_DECREF(self);
11395 Py_DECREF(str1);
11396 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011397 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011398 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011399 Py_DECREF(self);
11400 Py_DECREF(str1);
11401 Py_DECREF(str2);
11402 return result;
11403}
11404
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011405PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011406 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011407\n\
11408Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011409old replaced by new. If the optional argument count is\n\
11410given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011411
11412static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011413unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011414{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011415 PyObject *str1;
11416 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011417 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011418 PyObject *result;
11419
Martin v. Löwis18e16552006-02-15 17:27:45 +000011420 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011421 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011422 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011423 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011424 str1 = PyUnicode_FromObject(str1);
11425 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11426 return NULL;
11427 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011428 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011429 Py_DECREF(str1);
11430 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011431 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011432
11433 result = replace(self, str1, str2, maxcount);
11434
11435 Py_DECREF(str1);
11436 Py_DECREF(str2);
11437 return result;
11438}
11439
Alexander Belopolsky40018472011-02-26 01:02:56 +000011440static PyObject *
11441unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011442{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011443 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011444 Py_ssize_t isize;
11445 Py_ssize_t osize, squote, dquote, i, o;
11446 Py_UCS4 max, quote;
11447 int ikind, okind;
11448 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011449
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011450 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011451 return NULL;
11452
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011453 isize = PyUnicode_GET_LENGTH(unicode);
11454 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011455
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011456 /* Compute length of output, quote characters, and
11457 maximum character */
11458 osize = 2; /* quotes */
11459 max = 127;
11460 squote = dquote = 0;
11461 ikind = PyUnicode_KIND(unicode);
11462 for (i = 0; i < isize; i++) {
11463 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11464 switch (ch) {
11465 case '\'': squote++; osize++; break;
11466 case '"': dquote++; osize++; break;
11467 case '\\': case '\t': case '\r': case '\n':
11468 osize += 2; break;
11469 default:
11470 /* Fast-path ASCII */
11471 if (ch < ' ' || ch == 0x7f)
11472 osize += 4; /* \xHH */
11473 else if (ch < 0x7f)
11474 osize++;
11475 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11476 osize++;
11477 max = ch > max ? ch : max;
11478 }
11479 else if (ch < 0x100)
11480 osize += 4; /* \xHH */
11481 else if (ch < 0x10000)
11482 osize += 6; /* \uHHHH */
11483 else
11484 osize += 10; /* \uHHHHHHHH */
11485 }
11486 }
11487
11488 quote = '\'';
11489 if (squote) {
11490 if (dquote)
11491 /* Both squote and dquote present. Use squote,
11492 and escape them */
11493 osize += squote;
11494 else
11495 quote = '"';
11496 }
11497
11498 repr = PyUnicode_New(osize, max);
11499 if (repr == NULL)
11500 return NULL;
11501 okind = PyUnicode_KIND(repr);
11502 odata = PyUnicode_DATA(repr);
11503
11504 PyUnicode_WRITE(okind, odata, 0, quote);
11505 PyUnicode_WRITE(okind, odata, osize-1, quote);
11506
11507 for (i = 0, o = 1; i < isize; i++) {
11508 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011509
11510 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011511 if ((ch == quote) || (ch == '\\')) {
11512 PyUnicode_WRITE(okind, odata, o++, '\\');
11513 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011514 continue;
11515 }
11516
Benjamin Peterson29060642009-01-31 22:14:21 +000011517 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011518 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011519 PyUnicode_WRITE(okind, odata, o++, '\\');
11520 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011521 }
11522 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011523 PyUnicode_WRITE(okind, odata, o++, '\\');
11524 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011525 }
11526 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011527 PyUnicode_WRITE(okind, odata, o++, '\\');
11528 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011529 }
11530
11531 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011532 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011533 PyUnicode_WRITE(okind, odata, o++, '\\');
11534 PyUnicode_WRITE(okind, odata, o++, 'x');
11535 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11536 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011537 }
11538
Georg Brandl559e5d72008-06-11 18:37:52 +000011539 /* Copy ASCII characters as-is */
11540 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011541 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011542 }
11543
Benjamin Peterson29060642009-01-31 22:14:21 +000011544 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011545 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011546 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011547 (categories Z* and C* except ASCII space)
11548 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011549 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011550 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011551 if (ch <= 0xff) {
11552 PyUnicode_WRITE(okind, odata, o++, '\\');
11553 PyUnicode_WRITE(okind, odata, o++, 'x');
11554 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11555 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011556 }
11557 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011558 else if (ch >= 0x10000) {
11559 PyUnicode_WRITE(okind, odata, o++, '\\');
11560 PyUnicode_WRITE(okind, odata, o++, 'U');
11561 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
11562 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
11563 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
11564 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
11565 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11566 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11567 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11568 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011569 }
11570 /* Map 16-bit characters to '\uxxxx' */
11571 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011572 PyUnicode_WRITE(okind, odata, o++, '\\');
11573 PyUnicode_WRITE(okind, odata, o++, 'u');
11574 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11575 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11576 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11577 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011578 }
11579 }
11580 /* Copy characters as-is */
11581 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011582 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011583 }
11584 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011585 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011586 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020011587 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000011588 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011589}
11590
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011591PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011592 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011593\n\
11594Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011595such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011596arguments start and end are interpreted as in slice notation.\n\
11597\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011598Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011599
11600static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011601unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011602{
Jesus Ceaac451502011-04-20 17:09:23 +020011603 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011604 Py_ssize_t start;
11605 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011606 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011607
Jesus Ceaac451502011-04-20 17:09:23 +020011608 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11609 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011610 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011611
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011612 if (PyUnicode_READY(self) == -1)
11613 return NULL;
11614 if (PyUnicode_READY(substring) == -1)
11615 return NULL;
11616
11617 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +020011618 asciilib_rfind_slice, ucs1lib_rfind_slice,
11619 ucs2lib_rfind_slice, ucs4lib_rfind_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011620 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011621 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011622
11623 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011624
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011625 if (result == -2)
11626 return NULL;
11627
Christian Heimes217cfd12007-12-02 14:31:20 +000011628 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011629}
11630
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011631PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011632 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011633\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011634Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011635
11636static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011637unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011638{
Jesus Ceaac451502011-04-20 17:09:23 +020011639 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011640 Py_ssize_t start;
11641 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011642 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011643
Jesus Ceaac451502011-04-20 17:09:23 +020011644 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11645 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011646 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011647
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011648 if (PyUnicode_READY(self) == -1)
11649 return NULL;
11650 if (PyUnicode_READY(substring) == -1)
11651 return NULL;
11652
11653 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +020011654 asciilib_rfind_slice, ucs1lib_rfind_slice,
11655 ucs2lib_rfind_slice, ucs4lib_rfind_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011656 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011657 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011658
11659 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011660
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011661 if (result == -2)
11662 return NULL;
11663
Guido van Rossumd57fd912000-03-10 22:53:23 +000011664 if (result < 0) {
11665 PyErr_SetString(PyExc_ValueError, "substring not found");
11666 return NULL;
11667 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011668
Christian Heimes217cfd12007-12-02 14:31:20 +000011669 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011670}
11671
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011672PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011673 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011674\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011675Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011676done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011677
11678static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011679unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011680{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011681 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011682 Py_UCS4 fillchar = ' ';
11683
Victor Stinnere9a29352011-10-01 02:14:59 +020011684 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011685 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011686
Victor Stinnere9a29352011-10-01 02:14:59 +020011687 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011688 return NULL;
11689
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011690 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011691 Py_INCREF(self);
11692 return (PyObject*) self;
11693 }
11694
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011695 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011696}
11697
Alexander Belopolsky40018472011-02-26 01:02:56 +000011698PyObject *
11699PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011700{
11701 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011702
Guido van Rossumd57fd912000-03-10 22:53:23 +000011703 s = PyUnicode_FromObject(s);
11704 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011705 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011706 if (sep != NULL) {
11707 sep = PyUnicode_FromObject(sep);
11708 if (sep == NULL) {
11709 Py_DECREF(s);
11710 return NULL;
11711 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011712 }
11713
Victor Stinner9310abb2011-10-05 00:59:23 +020011714 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011715
11716 Py_DECREF(s);
11717 Py_XDECREF(sep);
11718 return result;
11719}
11720
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011721PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011722 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011723\n\
11724Return a list of the words in S, using sep as the\n\
11725delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011726splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011727whitespace string is a separator and empty strings are\n\
11728removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011729
11730static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011731unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011732{
11733 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011734 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011735
Martin v. Löwis18e16552006-02-15 17:27:45 +000011736 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011737 return NULL;
11738
11739 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011740 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011741 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020011742 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011743 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011744 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011745}
11746
Thomas Wouters477c8d52006-05-27 19:21:47 +000011747PyObject *
11748PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11749{
11750 PyObject* str_obj;
11751 PyObject* sep_obj;
11752 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011753 int kind1, kind2, kind;
11754 void *buf1 = NULL, *buf2 = NULL;
11755 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011756
11757 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011758 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011759 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011760 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011761 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011762 Py_DECREF(str_obj);
11763 return NULL;
11764 }
11765
Victor Stinner14f8f022011-10-05 20:58:25 +020011766 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011767 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020011768 kind = Py_MAX(kind1, kind2);
11769 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011770 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020011771 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011772 if (!buf1)
11773 goto onError;
11774 buf2 = PyUnicode_DATA(sep_obj);
11775 if (kind2 != kind)
11776 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11777 if (!buf2)
11778 goto onError;
11779 len1 = PyUnicode_GET_LENGTH(str_obj);
11780 len2 = PyUnicode_GET_LENGTH(sep_obj);
11781
Victor Stinner14f8f022011-10-05 20:58:25 +020011782 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011783 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020011784 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
11785 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11786 else
11787 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011788 break;
11789 case PyUnicode_2BYTE_KIND:
11790 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11791 break;
11792 case PyUnicode_4BYTE_KIND:
11793 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11794 break;
11795 default:
11796 assert(0);
11797 out = 0;
11798 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011799
11800 Py_DECREF(sep_obj);
11801 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011802 if (kind1 != kind)
11803 PyMem_Free(buf1);
11804 if (kind2 != kind)
11805 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011806
11807 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011808 onError:
11809 Py_DECREF(sep_obj);
11810 Py_DECREF(str_obj);
11811 if (kind1 != kind && buf1)
11812 PyMem_Free(buf1);
11813 if (kind2 != kind && buf2)
11814 PyMem_Free(buf2);
11815 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011816}
11817
11818
11819PyObject *
11820PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11821{
11822 PyObject* str_obj;
11823 PyObject* sep_obj;
11824 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011825 int kind1, kind2, kind;
11826 void *buf1 = NULL, *buf2 = NULL;
11827 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011828
11829 str_obj = PyUnicode_FromObject(str_in);
11830 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011831 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011832 sep_obj = PyUnicode_FromObject(sep_in);
11833 if (!sep_obj) {
11834 Py_DECREF(str_obj);
11835 return NULL;
11836 }
11837
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011838 kind1 = PyUnicode_KIND(str_in);
11839 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011840 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011841 buf1 = PyUnicode_DATA(str_in);
11842 if (kind1 != kind)
11843 buf1 = _PyUnicode_AsKind(str_in, kind);
11844 if (!buf1)
11845 goto onError;
11846 buf2 = PyUnicode_DATA(sep_obj);
11847 if (kind2 != kind)
11848 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11849 if (!buf2)
11850 goto onError;
11851 len1 = PyUnicode_GET_LENGTH(str_obj);
11852 len2 = PyUnicode_GET_LENGTH(sep_obj);
11853
11854 switch(PyUnicode_KIND(str_in)) {
11855 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020011856 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
11857 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11858 else
11859 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011860 break;
11861 case PyUnicode_2BYTE_KIND:
11862 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11863 break;
11864 case PyUnicode_4BYTE_KIND:
11865 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11866 break;
11867 default:
11868 assert(0);
11869 out = 0;
11870 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011871
11872 Py_DECREF(sep_obj);
11873 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011874 if (kind1 != kind)
11875 PyMem_Free(buf1);
11876 if (kind2 != kind)
11877 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011878
11879 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011880 onError:
11881 Py_DECREF(sep_obj);
11882 Py_DECREF(str_obj);
11883 if (kind1 != kind && buf1)
11884 PyMem_Free(buf1);
11885 if (kind2 != kind && buf2)
11886 PyMem_Free(buf2);
11887 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011888}
11889
11890PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011891 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011892\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011893Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011894the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011895found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011896
11897static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011898unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011899{
Victor Stinner9310abb2011-10-05 00:59:23 +020011900 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011901}
11902
11903PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011904 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011905\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011906Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011907the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011908separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011909
11910static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011911unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011912{
Victor Stinner9310abb2011-10-05 00:59:23 +020011913 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011914}
11915
Alexander Belopolsky40018472011-02-26 01:02:56 +000011916PyObject *
11917PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011918{
11919 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011920
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011921 s = PyUnicode_FromObject(s);
11922 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011923 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011924 if (sep != NULL) {
11925 sep = PyUnicode_FromObject(sep);
11926 if (sep == NULL) {
11927 Py_DECREF(s);
11928 return NULL;
11929 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011930 }
11931
Victor Stinner9310abb2011-10-05 00:59:23 +020011932 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011933
11934 Py_DECREF(s);
11935 Py_XDECREF(sep);
11936 return result;
11937}
11938
11939PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011940 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011941\n\
11942Return a list of the words in S, using sep as the\n\
11943delimiter string, starting at the end of the string and\n\
11944working to the front. If maxsplit is given, at most maxsplit\n\
11945splits are done. If sep is not specified, any whitespace string\n\
11946is a separator.");
11947
11948static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011949unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011950{
11951 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011952 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011953
Martin v. Löwis18e16552006-02-15 17:27:45 +000011954 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011955 return NULL;
11956
11957 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011958 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011959 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020011960 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011961 else
Victor Stinner9310abb2011-10-05 00:59:23 +020011962 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011963}
11964
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011965PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011966 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011967\n\
11968Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011969Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011970is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011971
11972static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011973unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011974{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011975 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011976 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011977
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011978 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11979 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011980 return NULL;
11981
Guido van Rossum86662912000-04-11 15:38:46 +000011982 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011983}
11984
11985static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011986PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011987{
Walter Dörwald346737f2007-05-31 10:44:43 +000011988 if (PyUnicode_CheckExact(self)) {
11989 Py_INCREF(self);
11990 return self;
11991 } else
11992 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020011993 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011994}
11995
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011996PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011997 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011998\n\
11999Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012000and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012001
12002static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012003unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012004{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012005 return fixup(self, fixswapcase);
12006}
12007
Georg Brandlceee0772007-11-27 23:48:05 +000012008PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012009 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012010\n\
12011Return a translation table usable for str.translate().\n\
12012If there is only one argument, it must be a dictionary mapping Unicode\n\
12013ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012014Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012015If there are two arguments, they must be strings of equal length, and\n\
12016in the resulting dictionary, each character in x will be mapped to the\n\
12017character at the same position in y. If there is a third argument, it\n\
12018must be a string, whose characters will be mapped to None in the result.");
12019
12020static PyObject*
12021unicode_maketrans(PyUnicodeObject *null, PyObject *args)
12022{
12023 PyObject *x, *y = NULL, *z = NULL;
12024 PyObject *new = NULL, *key, *value;
12025 Py_ssize_t i = 0;
12026 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012027
Georg Brandlceee0772007-11-27 23:48:05 +000012028 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12029 return NULL;
12030 new = PyDict_New();
12031 if (!new)
12032 return NULL;
12033 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012034 int x_kind, y_kind, z_kind;
12035 void *x_data, *y_data, *z_data;
12036
Georg Brandlceee0772007-11-27 23:48:05 +000012037 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012038 if (!PyUnicode_Check(x)) {
12039 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12040 "be a string if there is a second argument");
12041 goto err;
12042 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012043 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012044 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12045 "arguments must have equal length");
12046 goto err;
12047 }
12048 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012049 x_kind = PyUnicode_KIND(x);
12050 y_kind = PyUnicode_KIND(y);
12051 x_data = PyUnicode_DATA(x);
12052 y_data = PyUnicode_DATA(y);
12053 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12054 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12055 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012056 if (!key || !value)
12057 goto err;
12058 res = PyDict_SetItem(new, key, value);
12059 Py_DECREF(key);
12060 Py_DECREF(value);
12061 if (res < 0)
12062 goto err;
12063 }
12064 /* create entries for deleting chars in z */
12065 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012066 z_kind = PyUnicode_KIND(z);
12067 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000012068 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012069 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012070 if (!key)
12071 goto err;
12072 res = PyDict_SetItem(new, key, Py_None);
12073 Py_DECREF(key);
12074 if (res < 0)
12075 goto err;
12076 }
12077 }
12078 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012079 int kind;
12080 void *data;
12081
Georg Brandlceee0772007-11-27 23:48:05 +000012082 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012083 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012084 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12085 "to maketrans it must be a dict");
12086 goto err;
12087 }
12088 /* copy entries into the new dict, converting string keys to int keys */
12089 while (PyDict_Next(x, &i, &key, &value)) {
12090 if (PyUnicode_Check(key)) {
12091 /* convert string keys to integer keys */
12092 PyObject *newkey;
12093 if (PyUnicode_GET_SIZE(key) != 1) {
12094 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12095 "table must be of length 1");
12096 goto err;
12097 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012098 kind = PyUnicode_KIND(key);
12099 data = PyUnicode_DATA(key);
12100 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012101 if (!newkey)
12102 goto err;
12103 res = PyDict_SetItem(new, newkey, value);
12104 Py_DECREF(newkey);
12105 if (res < 0)
12106 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012107 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012108 /* just keep integer keys */
12109 if (PyDict_SetItem(new, key, value) < 0)
12110 goto err;
12111 } else {
12112 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12113 "be strings or integers");
12114 goto err;
12115 }
12116 }
12117 }
12118 return new;
12119 err:
12120 Py_DECREF(new);
12121 return NULL;
12122}
12123
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012124PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012125 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012126\n\
12127Return a copy of the string S, where all characters have been mapped\n\
12128through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012129Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012130Unmapped characters are left untouched. Characters mapped to None\n\
12131are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012132
12133static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012134unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012135{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012136 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012137}
12138
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012139PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012140 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012141\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012142Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012143
12144static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012145unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012146{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012147 return fixup(self, fixupper);
12148}
12149
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012150PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012151 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012152\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012153Pad a numeric string S with zeros on the left, to fill a field\n\
12154of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012155
12156static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012157unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012158{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012159 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012160 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012161 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012162 int kind;
12163 void *data;
12164 Py_UCS4 chr;
12165
12166 if (PyUnicode_READY(self) == -1)
12167 return NULL;
12168
Martin v. Löwis18e16552006-02-15 17:27:45 +000012169 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012170 return NULL;
12171
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012172 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012173 if (PyUnicode_CheckExact(self)) {
12174 Py_INCREF(self);
12175 return (PyObject*) self;
12176 }
12177 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020012178 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012179 }
12180
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012181 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012182
12183 u = pad(self, fill, 0, '0');
12184
Walter Dörwald068325e2002-04-15 13:36:47 +000012185 if (u == NULL)
12186 return NULL;
12187
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012188 kind = PyUnicode_KIND(u);
12189 data = PyUnicode_DATA(u);
12190 chr = PyUnicode_READ(kind, data, fill);
12191
12192 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012193 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012194 PyUnicode_WRITE(kind, data, 0, chr);
12195 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012196 }
12197
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012198 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012199 return (PyObject*) u;
12200}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012201
#if 0
/* Compiled out: thin wrapper returning the result of
   PyUnicode_TransformDecimalAndSpaceToASCII(self). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted;

    converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
12209
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012210PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012211 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012212\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012213Return True if S starts with the specified prefix, False otherwise.\n\
12214With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012215With optional end, stop comparing S at that position.\n\
12216prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012217
12218static PyObject *
12219unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012220 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012221{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012222 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012223 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012224 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012225 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012226 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012227
Jesus Ceaac451502011-04-20 17:09:23 +020012228 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012229 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012230 if (PyTuple_Check(subobj)) {
12231 Py_ssize_t i;
12232 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12233 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012234 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012235 if (substring == NULL)
12236 return NULL;
12237 result = tailmatch(self, substring, start, end, -1);
12238 Py_DECREF(substring);
12239 if (result) {
12240 Py_RETURN_TRUE;
12241 }
12242 }
12243 /* nothing matched */
12244 Py_RETURN_FALSE;
12245 }
12246 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012247 if (substring == NULL) {
12248 if (PyErr_ExceptionMatches(PyExc_TypeError))
12249 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12250 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012251 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012252 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012253 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012254 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012255 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012256}
12257
12258
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012259PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012260 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012261\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012262Return True if S ends with the specified suffix, False otherwise.\n\
12263With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012264With optional end, stop comparing S at that position.\n\
12265suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012266
12267static PyObject *
12268unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012269 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012270{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012271 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012272 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012273 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012274 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012275 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012276
Jesus Ceaac451502011-04-20 17:09:23 +020012277 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012278 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012279 if (PyTuple_Check(subobj)) {
12280 Py_ssize_t i;
12281 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12282 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012283 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012284 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012285 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012286 result = tailmatch(self, substring, start, end, +1);
12287 Py_DECREF(substring);
12288 if (result) {
12289 Py_RETURN_TRUE;
12290 }
12291 }
12292 Py_RETURN_FALSE;
12293 }
12294 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012295 if (substring == NULL) {
12296 if (PyErr_ExceptionMatches(PyExc_TypeError))
12297 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12298 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012299 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012300 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012301 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012302 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012303 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012304}
12305
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012306#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012307
12308PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012309 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012310\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012311Return a formatted version of S, using substitutions from args and kwargs.\n\
12312The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012313
Eric Smith27bbca62010-11-04 17:06:58 +000012314PyDoc_STRVAR(format_map__doc__,
12315 "S.format_map(mapping) -> str\n\
12316\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012317Return a formatted version of S, using substitutions from mapping.\n\
12318The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012319
Eric Smith4a7d76d2008-05-30 18:10:19 +000012320static PyObject *
12321unicode__format__(PyObject* self, PyObject* args)
12322{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012323 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012324
12325 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12326 return NULL;
12327
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012328 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012329 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012330 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012331}
12332
Eric Smith8c663262007-08-25 02:26:07 +000012333PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012334 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012335\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012336Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012337
12338static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012339unicode__sizeof__(PyUnicodeObject *v)
12340{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012341 Py_ssize_t size;
12342
12343 /* If it's a compact object, account for base structure +
12344 character data. */
12345 if (PyUnicode_IS_COMPACT_ASCII(v))
12346 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12347 else if (PyUnicode_IS_COMPACT(v))
12348 size = sizeof(PyCompactUnicodeObject) +
12349 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
12350 else {
12351 /* If it is a two-block object, account for base object, and
12352 for character block if present. */
12353 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012354 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012355 size += (PyUnicode_GET_LENGTH(v) + 1) *
12356 PyUnicode_CHARACTER_SIZE(v);
12357 }
12358 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012359 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012360 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012361 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012362 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012363 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012364
12365 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012366}
12367
12368PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012369 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012370
12371static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012372unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012373{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012374 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012375 if (!copy)
12376 return NULL;
12377 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012378}
12379
Guido van Rossumd57fd912000-03-10 22:53:23 +000012380static PyMethodDef unicode_methods[] = {
12381
12382 /* Order is according to common usage: often used methods should
12383 appear first, since lookup is done sequentially. */
12384
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012385 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012386 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12387 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012388 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012389 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12390 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12391 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12392 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12393 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12394 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12395 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012396 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012397 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12398 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12399 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012400 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012401 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12402 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12403 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012404 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012405 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012406 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012407 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012408 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12409 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12410 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12411 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12412 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12413 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12414 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12415 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12416 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12417 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12418 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12419 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12420 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12421 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012422 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012423 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012424 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012425 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012426 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012427 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012428 {"maketrans", (PyCFunction) unicode_maketrans,
12429 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012430 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012431#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012432 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012433#endif
12434
12435#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012436 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012437 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012438#endif
12439
Benjamin Peterson14339b62009-01-31 16:36:08 +000012440 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012441 {NULL, NULL}
12442};
12443
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012444static PyObject *
12445unicode_mod(PyObject *v, PyObject *w)
12446{
Brian Curtindfc80e32011-08-10 20:28:54 -050012447 if (!PyUnicode_Check(v))
12448 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012449 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012450}
12451
12452static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012453 0, /*nb_add*/
12454 0, /*nb_subtract*/
12455 0, /*nb_multiply*/
12456 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012457};
12458
Guido van Rossumd57fd912000-03-10 22:53:23 +000012459static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012460 (lenfunc) unicode_length, /* sq_length */
12461 PyUnicode_Concat, /* sq_concat */
12462 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12463 (ssizeargfunc) unicode_getitem, /* sq_item */
12464 0, /* sq_slice */
12465 0, /* sq_ass_item */
12466 0, /* sq_ass_slice */
12467 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012468};
12469
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012470static PyObject*
12471unicode_subscript(PyUnicodeObject* self, PyObject* item)
12472{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012473 if (PyUnicode_READY(self) == -1)
12474 return NULL;
12475
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012476 if (PyIndex_Check(item)) {
12477 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012478 if (i == -1 && PyErr_Occurred())
12479 return NULL;
12480 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012481 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012482 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012483 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012484 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012485 PyObject *result;
12486 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012487 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012488 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012489
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012490 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012491 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012492 return NULL;
12493 }
12494
12495 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012496 return PyUnicode_New(0, 0);
12497 } else if (start == 0 && step == 1 &&
12498 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012499 PyUnicode_CheckExact(self)) {
12500 Py_INCREF(self);
12501 return (PyObject *)self;
12502 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012503 return PyUnicode_Substring((PyObject*)self,
12504 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012505 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012506 /* General case */
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012507 max_char = 0;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012508 src_kind = PyUnicode_KIND(self);
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012509 kind_limit = kind_maxchar_limit(src_kind);
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012510 src_data = PyUnicode_DATA(self);
12511 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12512 ch = PyUnicode_READ(src_kind, src_data, cur);
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012513 if (ch > max_char) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012514 max_char = ch;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012515 if (max_char >= kind_limit)
12516 break;
12517 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012518 }
12519 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012520 if (result == NULL)
12521 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012522 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012523 dest_data = PyUnicode_DATA(result);
12524
12525 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012526 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
12527 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012528 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012529 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012530 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012531 } else {
12532 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12533 return NULL;
12534 }
12535}
12536
12537static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012538 (lenfunc)unicode_length, /* mp_length */
12539 (binaryfunc)unicode_subscript, /* mp_subscript */
12540 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012541};
12542
Guido van Rossumd57fd912000-03-10 22:53:23 +000012543
Guido van Rossumd57fd912000-03-10 22:53:23 +000012544/* Helpers for PyUnicode_Format() */
12545
12546static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012547getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012548{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012549 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012550 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012551 (*p_argidx)++;
12552 if (arglen < 0)
12553 return args;
12554 else
12555 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012556 }
12557 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012558 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012559 return NULL;
12560}
12561
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012562/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012563
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012564static PyObject *
12565formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012566{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012567 char *p;
12568 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012569 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012570
Guido van Rossumd57fd912000-03-10 22:53:23 +000012571 x = PyFloat_AsDouble(v);
12572 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012573 return NULL;
12574
Guido van Rossumd57fd912000-03-10 22:53:23 +000012575 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012576 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012577
Eric Smith0923d1d2009-04-16 20:16:10 +000012578 p = PyOS_double_to_string(x, type, prec,
12579 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012580 if (p == NULL)
12581 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012582 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012583 PyMem_Free(p);
12584 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012585}
12586
Tim Peters38fd5b62000-09-21 05:43:11 +000012587static PyObject*
12588formatlong(PyObject *val, int flags, int prec, int type)
12589{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012590 char *buf;
12591 int len;
12592 PyObject *str; /* temporary string object. */
12593 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012594
Benjamin Peterson14339b62009-01-31 16:36:08 +000012595 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12596 if (!str)
12597 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012598 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012599 Py_DECREF(str);
12600 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012601}
12602
Guido van Rossumd57fd912000-03-10 22:53:23 +000012603static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012604formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012605 size_t buflen,
12606 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012607{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012608 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012609 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012610 if (PyUnicode_GET_LENGTH(v) == 1) {
12611 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012612 buf[1] = '\0';
12613 return 1;
12614 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012615 goto onError;
12616 }
12617 else {
12618 /* Integer input truncated to a character */
12619 long x;
12620 x = PyLong_AsLong(v);
12621 if (x == -1 && PyErr_Occurred())
12622 goto onError;
12623
12624 if (x < 0 || x > 0x10ffff) {
12625 PyErr_SetString(PyExc_OverflowError,
12626 "%c arg not in range(0x110000)");
12627 return -1;
12628 }
12629
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012630 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012631 buf[1] = '\0';
12632 return 1;
12633 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012634
Benjamin Peterson29060642009-01-31 22:14:21 +000012635 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012636 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012637 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012638 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012639}
12640
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
   FORMATBUFLEN is the length of the buffer in which chars are formatted.
   The expansion is fully parenthesised so the macro behaves as a single
   operand in any expression (e.g. "sizeof FORMATBUFLEN").
*/
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012645
Alexander Belopolsky40018472011-02-26 01:02:56 +000012646PyObject *
12647PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012648{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012649 void *fmt;
12650 int fmtkind;
12651 PyObject *result;
12652 Py_UCS4 *res, *res0;
12653 Py_UCS4 max;
12654 int kind;
12655 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012656 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012657 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012658 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000012659
Guido van Rossumd57fd912000-03-10 22:53:23 +000012660 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012661 PyErr_BadInternalCall();
12662 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012663 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012664 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12665 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012666 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012667 fmt = PyUnicode_DATA(uformat);
12668 fmtkind = PyUnicode_KIND(uformat);
12669 fmtcnt = PyUnicode_GET_LENGTH(uformat);
12670 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012671
12672 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012673 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
12674 if (res0 == NULL) {
12675 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012676 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012677 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012678
12679 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012680 arglen = PyTuple_Size(args);
12681 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012682 }
12683 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012684 arglen = -1;
12685 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012686 }
Christian Heimes90aa7642007-12-19 02:45:37 +000012687 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000012688 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000012689 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012690
12691 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012692 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012693 if (--rescnt < 0) {
12694 rescnt = fmtcnt + 100;
12695 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012696 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12697 if (res0 == NULL){
12698 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012699 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012700 }
12701 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012702 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012703 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012704 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012705 }
12706 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012707 /* Got a format specifier */
12708 int flags = 0;
12709 Py_ssize_t width = -1;
12710 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012711 Py_UCS4 c = '\0';
12712 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000012713 int isnumok;
12714 PyObject *v = NULL;
12715 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012716 void *pbuf;
12717 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000012718 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012719 Py_ssize_t len, len1;
12720 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012721
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012722 fmtpos++;
12723 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12724 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012725 Py_ssize_t keylen;
12726 PyObject *key;
12727 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012728
Benjamin Peterson29060642009-01-31 22:14:21 +000012729 if (dict == NULL) {
12730 PyErr_SetString(PyExc_TypeError,
12731 "format requires a mapping");
12732 goto onError;
12733 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012734 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012735 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012736 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012737 /* Skip over balanced parentheses */
12738 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012739 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012740 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012741 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012742 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012743 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012744 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012745 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012746 if (fmtcnt < 0 || pcount > 0) {
12747 PyErr_SetString(PyExc_ValueError,
12748 "incomplete format key");
12749 goto onError;
12750 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012751 key = PyUnicode_Substring((PyObject*)uformat,
12752 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012753 if (key == NULL)
12754 goto onError;
12755 if (args_owned) {
12756 Py_DECREF(args);
12757 args_owned = 0;
12758 }
12759 args = PyObject_GetItem(dict, key);
12760 Py_DECREF(key);
12761 if (args == NULL) {
12762 goto onError;
12763 }
12764 args_owned = 1;
12765 arglen = -1;
12766 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012767 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012768 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012769 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012770 case '-': flags |= F_LJUST; continue;
12771 case '+': flags |= F_SIGN; continue;
12772 case ' ': flags |= F_BLANK; continue;
12773 case '#': flags |= F_ALT; continue;
12774 case '0': flags |= F_ZERO; continue;
12775 }
12776 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012777 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012778 if (c == '*') {
12779 v = getnextarg(args, arglen, &argidx);
12780 if (v == NULL)
12781 goto onError;
12782 if (!PyLong_Check(v)) {
12783 PyErr_SetString(PyExc_TypeError,
12784 "* wants int");
12785 goto onError;
12786 }
12787 width = PyLong_AsLong(v);
12788 if (width == -1 && PyErr_Occurred())
12789 goto onError;
12790 if (width < 0) {
12791 flags |= F_LJUST;
12792 width = -width;
12793 }
12794 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012795 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012796 }
12797 else if (c >= '0' && c <= '9') {
12798 width = c - '0';
12799 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012800 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012801 if (c < '0' || c > '9')
12802 break;
12803 if ((width*10) / 10 != width) {
12804 PyErr_SetString(PyExc_ValueError,
12805 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012806 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012807 }
12808 width = width*10 + (c - '0');
12809 }
12810 }
12811 if (c == '.') {
12812 prec = 0;
12813 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012814 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012815 if (c == '*') {
12816 v = getnextarg(args, arglen, &argidx);
12817 if (v == NULL)
12818 goto onError;
12819 if (!PyLong_Check(v)) {
12820 PyErr_SetString(PyExc_TypeError,
12821 "* wants int");
12822 goto onError;
12823 }
12824 prec = PyLong_AsLong(v);
12825 if (prec == -1 && PyErr_Occurred())
12826 goto onError;
12827 if (prec < 0)
12828 prec = 0;
12829 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012830 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012831 }
12832 else if (c >= '0' && c <= '9') {
12833 prec = c - '0';
12834 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012835 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012836 if (c < '0' || c > '9')
12837 break;
12838 if ((prec*10) / 10 != prec) {
12839 PyErr_SetString(PyExc_ValueError,
12840 "prec too big");
12841 goto onError;
12842 }
12843 prec = prec*10 + (c - '0');
12844 }
12845 }
12846 } /* prec */
12847 if (fmtcnt >= 0) {
12848 if (c == 'h' || c == 'l' || c == 'L') {
12849 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012850 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012851 }
12852 }
12853 if (fmtcnt < 0) {
12854 PyErr_SetString(PyExc_ValueError,
12855 "incomplete format");
12856 goto onError;
12857 }
12858 if (c != '%') {
12859 v = getnextarg(args, arglen, &argidx);
12860 if (v == NULL)
12861 goto onError;
12862 }
12863 sign = 0;
12864 fill = ' ';
12865 switch (c) {
12866
12867 case '%':
12868 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012869 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012870 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012871 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012872 len = 1;
12873 break;
12874
12875 case 's':
12876 case 'r':
12877 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012878 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012879 temp = v;
12880 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012881 }
12882 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012883 if (c == 's')
12884 temp = PyObject_Str(v);
12885 else if (c == 'r')
12886 temp = PyObject_Repr(v);
12887 else
12888 temp = PyObject_ASCII(v);
12889 if (temp == NULL)
12890 goto onError;
12891 if (PyUnicode_Check(temp))
12892 /* nothing to do */;
12893 else {
12894 Py_DECREF(temp);
12895 PyErr_SetString(PyExc_TypeError,
12896 "%s argument has non-string str()");
12897 goto onError;
12898 }
12899 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012900 if (PyUnicode_READY(temp) == -1) {
12901 Py_CLEAR(temp);
12902 goto onError;
12903 }
12904 pbuf = PyUnicode_DATA(temp);
12905 kind = PyUnicode_KIND(temp);
12906 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012907 if (prec >= 0 && len > prec)
12908 len = prec;
12909 break;
12910
12911 case 'i':
12912 case 'd':
12913 case 'u':
12914 case 'o':
12915 case 'x':
12916 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012917 isnumok = 0;
12918 if (PyNumber_Check(v)) {
12919 PyObject *iobj=NULL;
12920
12921 if (PyLong_Check(v)) {
12922 iobj = v;
12923 Py_INCREF(iobj);
12924 }
12925 else {
12926 iobj = PyNumber_Long(v);
12927 }
12928 if (iobj!=NULL) {
12929 if (PyLong_Check(iobj)) {
12930 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012931 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012932 Py_DECREF(iobj);
12933 if (!temp)
12934 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012935 if (PyUnicode_READY(temp) == -1) {
12936 Py_CLEAR(temp);
12937 goto onError;
12938 }
12939 pbuf = PyUnicode_DATA(temp);
12940 kind = PyUnicode_KIND(temp);
12941 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012942 sign = 1;
12943 }
12944 else {
12945 Py_DECREF(iobj);
12946 }
12947 }
12948 }
12949 if (!isnumok) {
12950 PyErr_Format(PyExc_TypeError,
12951 "%%%c format: a number is required, "
12952 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12953 goto onError;
12954 }
12955 if (flags & F_ZERO)
12956 fill = '0';
12957 break;
12958
12959 case 'e':
12960 case 'E':
12961 case 'f':
12962 case 'F':
12963 case 'g':
12964 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012965 temp = formatfloat(v, flags, prec, c);
12966 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012967 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012968 if (PyUnicode_READY(temp) == -1) {
12969 Py_CLEAR(temp);
12970 goto onError;
12971 }
12972 pbuf = PyUnicode_DATA(temp);
12973 kind = PyUnicode_KIND(temp);
12974 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012975 sign = 1;
12976 if (flags & F_ZERO)
12977 fill = '0';
12978 break;
12979
12980 case 'c':
12981 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012982 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012983 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012984 if (len < 0)
12985 goto onError;
12986 break;
12987
12988 default:
12989 PyErr_Format(PyExc_ValueError,
12990 "unsupported format character '%c' (0x%x) "
12991 "at index %zd",
12992 (31<=c && c<=126) ? (char)c : '?',
12993 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012994 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012995 goto onError;
12996 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012997 /* pbuf is initialized here. */
12998 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012999 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013000 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
13001 PyUnicode_READ(kind, pbuf, pindex) == '+') {
13002 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013003 len--;
13004 }
13005 else if (flags & F_SIGN)
13006 sign = '+';
13007 else if (flags & F_BLANK)
13008 sign = ' ';
13009 else
13010 sign = 0;
13011 }
13012 if (width < len)
13013 width = len;
13014 if (rescnt - (sign != 0) < width) {
13015 reslen -= rescnt;
13016 rescnt = width + fmtcnt + 100;
13017 reslen += rescnt;
13018 if (reslen < 0) {
13019 Py_XDECREF(temp);
13020 PyErr_NoMemory();
13021 goto onError;
13022 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013023 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
13024 if (res0 == 0) {
13025 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000013026 Py_XDECREF(temp);
13027 goto onError;
13028 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013029 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000013030 }
13031 if (sign) {
13032 if (fill != ' ')
13033 *res++ = sign;
13034 rescnt--;
13035 if (width > len)
13036 width--;
13037 }
13038 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013039 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13040 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013041 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013042 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
13043 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013044 }
13045 rescnt -= 2;
13046 width -= 2;
13047 if (width < 0)
13048 width = 0;
13049 len -= 2;
13050 }
13051 if (width > len && !(flags & F_LJUST)) {
13052 do {
13053 --rescnt;
13054 *res++ = fill;
13055 } while (--width > len);
13056 }
13057 if (fill == ' ') {
13058 if (sign)
13059 *res++ = sign;
13060 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013061 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13062 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
13063 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
13064 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013065 }
13066 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013067 /* Copy all characters, preserving len */
13068 len1 = len;
13069 while (len1--) {
13070 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
13071 rescnt--;
13072 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013073 while (--width >= len) {
13074 --rescnt;
13075 *res++ = ' ';
13076 }
13077 if (dict && (argidx < arglen) && c != '%') {
13078 PyErr_SetString(PyExc_TypeError,
13079 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000013080 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013081 goto onError;
13082 }
13083 Py_XDECREF(temp);
13084 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013085 } /* until end */
13086 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013087 PyErr_SetString(PyExc_TypeError,
13088 "not all arguments converted during string formatting");
13089 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013090 }
13091
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013092
13093 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
13094 if (*res > max)
13095 max = *res;
13096 result = PyUnicode_New(reslen - rescnt, max);
13097 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000013098 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013099 kind = PyUnicode_KIND(result);
13100 for (res = res0; res < res0+reslen-rescnt; res++)
13101 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
13102 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013103 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013104 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013105 }
13106 Py_DECREF(uformat);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013107 assert(_PyUnicode_CheckConsistency(result, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000013108 return (PyObject *)result;
13109
Benjamin Peterson29060642009-01-31 22:14:21 +000013110 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013111 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013112 Py_DECREF(uformat);
13113 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013114 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013115 }
13116 return NULL;
13117}
13118
Jeremy Hylton938ace62002-07-17 16:30:39 +000013119static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013120unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13121
Tim Peters6d6c1a32001-08-02 04:15:00 +000013122static PyObject *
13123unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13124{
Benjamin Peterson29060642009-01-31 22:14:21 +000013125 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013126 static char *kwlist[] = {"object", "encoding", "errors", 0};
13127 char *encoding = NULL;
13128 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013129
Benjamin Peterson14339b62009-01-31 16:36:08 +000013130 if (type != &PyUnicode_Type)
13131 return unicode_subtype_new(type, args, kwds);
13132 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013133 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013134 return NULL;
13135 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013136 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013137 if (encoding == NULL && errors == NULL)
13138 return PyObject_Str(x);
13139 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013140 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013141}
13142
Guido van Rossume023fe02001-08-30 03:12:59 +000013143static PyObject *
13144unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13145{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013146 PyUnicodeObject *unicode, *self;
13147 Py_ssize_t length, char_size;
13148 int share_wstr, share_utf8;
13149 unsigned int kind;
13150 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013151
Benjamin Peterson14339b62009-01-31 16:36:08 +000013152 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013153
13154 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
13155 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013156 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013157 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013158 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013159 return NULL;
13160
13161 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
13162 if (self == NULL) {
13163 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013164 return NULL;
13165 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013166 kind = PyUnicode_KIND(unicode);
13167 length = PyUnicode_GET_LENGTH(unicode);
13168
13169 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013170#ifdef Py_DEBUG
13171 _PyUnicode_HASH(self) = -1;
13172#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013173 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013174#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013175 _PyUnicode_STATE(self).interned = 0;
13176 _PyUnicode_STATE(self).kind = kind;
13177 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013178 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013179 _PyUnicode_STATE(self).ready = 1;
13180 _PyUnicode_WSTR(self) = NULL;
13181 _PyUnicode_UTF8_LENGTH(self) = 0;
13182 _PyUnicode_UTF8(self) = NULL;
13183 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013184 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013185
13186 share_utf8 = 0;
13187 share_wstr = 0;
13188 if (kind == PyUnicode_1BYTE_KIND) {
13189 char_size = 1;
13190 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13191 share_utf8 = 1;
13192 }
13193 else if (kind == PyUnicode_2BYTE_KIND) {
13194 char_size = 2;
13195 if (sizeof(wchar_t) == 2)
13196 share_wstr = 1;
13197 }
13198 else {
13199 assert(kind == PyUnicode_4BYTE_KIND);
13200 char_size = 4;
13201 if (sizeof(wchar_t) == 4)
13202 share_wstr = 1;
13203 }
13204
13205 /* Ensure we won't overflow the length. */
13206 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13207 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013208 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013209 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013210 data = PyObject_MALLOC((length + 1) * char_size);
13211 if (data == NULL) {
13212 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013213 goto onError;
13214 }
13215
Victor Stinnerc3c74152011-10-02 20:39:55 +020013216 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013217 if (share_utf8) {
13218 _PyUnicode_UTF8_LENGTH(self) = length;
13219 _PyUnicode_UTF8(self) = data;
13220 }
13221 if (share_wstr) {
13222 _PyUnicode_WSTR_LENGTH(self) = length;
13223 _PyUnicode_WSTR(self) = (wchar_t *)data;
13224 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013225
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013226 Py_MEMCPY(data, PyUnicode_DATA(unicode),
13227 PyUnicode_KIND_SIZE(kind, length + 1));
13228 Py_DECREF(unicode);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013229 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013230#ifdef Py_DEBUG
13231 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13232#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013233 return (PyObject *)self;
13234
13235onError:
13236 Py_DECREF(unicode);
13237 Py_DECREF(self);
13238 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013239}
13240
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013241PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013242 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013243\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013244Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013245encoding defaults to the current default string encoding.\n\
13246errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013247
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013248static PyObject *unicode_iter(PyObject *seq);
13249
Guido van Rossumd57fd912000-03-10 22:53:23 +000013250PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013251 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013252 "str", /* tp_name */
13253 sizeof(PyUnicodeObject), /* tp_size */
13254 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013255 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013256 (destructor)unicode_dealloc, /* tp_dealloc */
13257 0, /* tp_print */
13258 0, /* tp_getattr */
13259 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013260 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013261 unicode_repr, /* tp_repr */
13262 &unicode_as_number, /* tp_as_number */
13263 &unicode_as_sequence, /* tp_as_sequence */
13264 &unicode_as_mapping, /* tp_as_mapping */
13265 (hashfunc) unicode_hash, /* tp_hash*/
13266 0, /* tp_call*/
13267 (reprfunc) unicode_str, /* tp_str */
13268 PyObject_GenericGetAttr, /* tp_getattro */
13269 0, /* tp_setattro */
13270 0, /* tp_as_buffer */
13271 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013272 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013273 unicode_doc, /* tp_doc */
13274 0, /* tp_traverse */
13275 0, /* tp_clear */
13276 PyUnicode_RichCompare, /* tp_richcompare */
13277 0, /* tp_weaklistoffset */
13278 unicode_iter, /* tp_iter */
13279 0, /* tp_iternext */
13280 unicode_methods, /* tp_methods */
13281 0, /* tp_members */
13282 0, /* tp_getset */
13283 &PyBaseObject_Type, /* tp_base */
13284 0, /* tp_dict */
13285 0, /* tp_descr_get */
13286 0, /* tp_descr_set */
13287 0, /* tp_dictoffset */
13288 0, /* tp_init */
13289 0, /* tp_alloc */
13290 unicode_new, /* tp_new */
13291 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013292};
13293
13294/* Initialize the Unicode implementation */
13295
Thomas Wouters78890102000-07-22 19:25:51 +000013296void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013297{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013298 int i;
13299
Thomas Wouters477c8d52006-05-27 19:21:47 +000013300 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013301 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013302 0x000A, /* LINE FEED */
13303 0x000D, /* CARRIAGE RETURN */
13304 0x001C, /* FILE SEPARATOR */
13305 0x001D, /* GROUP SEPARATOR */
13306 0x001E, /* RECORD SEPARATOR */
13307 0x0085, /* NEXT LINE */
13308 0x2028, /* LINE SEPARATOR */
13309 0x2029, /* PARAGRAPH SEPARATOR */
13310 };
13311
Fred Drakee4315f52000-05-09 19:53:39 +000013312 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013313 unicode_empty = PyUnicode_New(0, 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013314 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013315 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013316 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013317
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013318 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013319 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013320 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013321 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013322
13323 /* initialize the linebreak bloom filter */
13324 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013325 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013326 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013327
13328 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013329}
13330
13331/* Finalize the Unicode implementation */
13332
/* Free-list clearing entry point.  This implementation releases nothing
   and always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
13338
Guido van Rossumd57fd912000-03-10 22:53:23 +000013339void
Thomas Wouters78890102000-07-22 19:25:51 +000013340_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013341{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013342 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013343
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013344 Py_XDECREF(unicode_empty);
13345 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013346
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013347 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013348 if (unicode_latin1[i]) {
13349 Py_DECREF(unicode_latin1[i]);
13350 unicode_latin1[i] = NULL;
13351 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013352 }
Christian Heimesa156e092008-02-16 07:38:31 +000013353 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013354}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013355
Walter Dörwald16807132007-05-25 13:52:07 +000013356void
13357PyUnicode_InternInPlace(PyObject **p)
13358{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013359 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
13360 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013361#ifdef Py_DEBUG
13362 assert(s != NULL);
13363 assert(_PyUnicode_CHECK(s));
13364#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013365 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013366 return;
13367#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013368 /* If it's a subclass, we don't really know what putting
13369 it in the interned dict might do. */
13370 if (!PyUnicode_CheckExact(s))
13371 return;
13372 if (PyUnicode_CHECK_INTERNED(s))
13373 return;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013374 if (_PyUnicode_READY_REPLACE(p)) {
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013375 assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013376 return;
13377 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013378 s = (PyUnicodeObject *)(*p);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013379 if (interned == NULL) {
13380 interned = PyDict_New();
13381 if (interned == NULL) {
13382 PyErr_Clear(); /* Don't leave an exception */
13383 return;
13384 }
13385 }
13386 /* It might be that the GetItem call fails even
13387 though the key is present in the dictionary,
13388 namely when this happens during a stack overflow. */
13389 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000013390 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013391 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013392
Benjamin Peterson29060642009-01-31 22:14:21 +000013393 if (t) {
13394 Py_INCREF(t);
13395 Py_DECREF(*p);
13396 *p = t;
13397 return;
13398 }
Walter Dörwald16807132007-05-25 13:52:07 +000013399
Benjamin Peterson14339b62009-01-31 16:36:08 +000013400 PyThreadState_GET()->recursion_critical = 1;
13401 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
13402 PyErr_Clear();
13403 PyThreadState_GET()->recursion_critical = 0;
13404 return;
13405 }
13406 PyThreadState_GET()->recursion_critical = 0;
13407 /* The two references in interned are not counted by refcnt.
13408 The deallocator will take care of this */
13409 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013410 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013411}
13412
13413void
13414PyUnicode_InternImmortal(PyObject **p)
13415{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013416 PyUnicodeObject *u = (PyUnicodeObject *)*p;
13417
Benjamin Peterson14339b62009-01-31 16:36:08 +000013418 PyUnicode_InternInPlace(p);
13419 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013420 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013421 Py_INCREF(*p);
13422 }
Walter Dörwald16807132007-05-25 13:52:07 +000013423}
13424
13425PyObject *
13426PyUnicode_InternFromString(const char *cp)
13427{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013428 PyObject *s = PyUnicode_FromString(cp);
13429 if (s == NULL)
13430 return NULL;
13431 PyUnicode_InternInPlace(&s);
13432 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013433}
13434
Alexander Belopolsky40018472011-02-26 01:02:56 +000013435void
13436_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013437{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013438 PyObject *keys;
13439 PyUnicodeObject *s;
13440 Py_ssize_t i, n;
13441 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013442
Benjamin Peterson14339b62009-01-31 16:36:08 +000013443 if (interned == NULL || !PyDict_Check(interned))
13444 return;
13445 keys = PyDict_Keys(interned);
13446 if (keys == NULL || !PyList_Check(keys)) {
13447 PyErr_Clear();
13448 return;
13449 }
Walter Dörwald16807132007-05-25 13:52:07 +000013450
Benjamin Peterson14339b62009-01-31 16:36:08 +000013451 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13452 detector, interned unicode strings are not forcibly deallocated;
13453 rather, we give them their stolen references back, and then clear
13454 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013455
Benjamin Peterson14339b62009-01-31 16:36:08 +000013456 n = PyList_GET_SIZE(keys);
13457 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013458 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013459 for (i = 0; i < n; i++) {
13460 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013461 if (PyUnicode_READY(s) == -1) {
13462 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013463 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013464 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013465 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013466 case SSTATE_NOT_INTERNED:
13467 /* XXX Shouldn't happen */
13468 break;
13469 case SSTATE_INTERNED_IMMORTAL:
13470 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013471 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013472 break;
13473 case SSTATE_INTERNED_MORTAL:
13474 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013475 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013476 break;
13477 default:
13478 Py_FatalError("Inconsistent interned string state.");
13479 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013480 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013481 }
13482 fprintf(stderr, "total size of all interned strings: "
13483 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13484 "mortal/immortal\n", mortal_size, immortal_size);
13485 Py_DECREF(keys);
13486 PyDict_Clear(interned);
13487 Py_DECREF(interned);
13488 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013489}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013490
13491
13492/********************* Unicode Iterator **************************/
13493
13494typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013495 PyObject_HEAD
13496 Py_ssize_t it_index;
13497 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013498} unicodeiterobject;
13499
13500static void
13501unicodeiter_dealloc(unicodeiterobject *it)
13502{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013503 _PyObject_GC_UNTRACK(it);
13504 Py_XDECREF(it->it_seq);
13505 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013506}
13507
13508static int
13509unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13510{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013511 Py_VISIT(it->it_seq);
13512 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013513}
13514
13515static PyObject *
13516unicodeiter_next(unicodeiterobject *it)
13517{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013518 PyUnicodeObject *seq;
13519 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013520
Benjamin Peterson14339b62009-01-31 16:36:08 +000013521 assert(it != NULL);
13522 seq = it->it_seq;
13523 if (seq == NULL)
13524 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013525 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013526
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013527 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13528 int kind = PyUnicode_KIND(seq);
13529 void *data = PyUnicode_DATA(seq);
13530 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13531 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013532 if (item != NULL)
13533 ++it->it_index;
13534 return item;
13535 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013536
Benjamin Peterson14339b62009-01-31 16:36:08 +000013537 Py_DECREF(seq);
13538 it->it_seq = NULL;
13539 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013540}
13541
13542static PyObject *
13543unicodeiter_len(unicodeiterobject *it)
13544{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013545 Py_ssize_t len = 0;
13546 if (it->it_seq)
13547 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
13548 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013549}
13550
13551PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
13552
13553static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013554 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000013555 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000013556 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013557};
13558
13559PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013560 PyVarObject_HEAD_INIT(&PyType_Type, 0)
13561 "str_iterator", /* tp_name */
13562 sizeof(unicodeiterobject), /* tp_basicsize */
13563 0, /* tp_itemsize */
13564 /* methods */
13565 (destructor)unicodeiter_dealloc, /* tp_dealloc */
13566 0, /* tp_print */
13567 0, /* tp_getattr */
13568 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013569 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013570 0, /* tp_repr */
13571 0, /* tp_as_number */
13572 0, /* tp_as_sequence */
13573 0, /* tp_as_mapping */
13574 0, /* tp_hash */
13575 0, /* tp_call */
13576 0, /* tp_str */
13577 PyObject_GenericGetAttr, /* tp_getattro */
13578 0, /* tp_setattro */
13579 0, /* tp_as_buffer */
13580 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
13581 0, /* tp_doc */
13582 (traverseproc)unicodeiter_traverse, /* tp_traverse */
13583 0, /* tp_clear */
13584 0, /* tp_richcompare */
13585 0, /* tp_weaklistoffset */
13586 PyObject_SelfIter, /* tp_iter */
13587 (iternextfunc)unicodeiter_next, /* tp_iternext */
13588 unicodeiter_methods, /* tp_methods */
13589 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013590};
13591
13592static PyObject *
13593unicode_iter(PyObject *seq)
13594{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013595 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013596
Benjamin Peterson14339b62009-01-31 16:36:08 +000013597 if (!PyUnicode_Check(seq)) {
13598 PyErr_BadInternalCall();
13599 return NULL;
13600 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013601 if (PyUnicode_READY(seq) == -1)
13602 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013603 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13604 if (it == NULL)
13605 return NULL;
13606 it->it_index = 0;
13607 Py_INCREF(seq);
13608 it->it_seq = (PyUnicodeObject *)seq;
13609 _PyObject_GC_TRACK(it);
13610 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013611}
13612
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013613#define UNIOP(x) Py_UNICODE_##x
13614#define UNIOP_t Py_UNICODE
13615#include "uniops.h"
13616#undef UNIOP
13617#undef UNIOP_t
13618#define UNIOP(x) Py_UCS4_##x
13619#define UNIOP_t Py_UCS4
13620#include "uniops.h"
13621#undef UNIOP
13622#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000013623
Victor Stinner71133ff2010-09-01 23:43:53 +000013624Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000013625PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000013626{
13627 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
13628 Py_UNICODE *copy;
13629 Py_ssize_t size;
13630
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013631 if (!PyUnicode_Check(unicode)) {
13632 PyErr_BadArgument();
13633 return NULL;
13634 }
Victor Stinner71133ff2010-09-01 23:43:53 +000013635 /* Ensure we won't overflow the size. */
13636 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13637 PyErr_NoMemory();
13638 return NULL;
13639 }
13640 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13641 size *= sizeof(Py_UNICODE);
13642 copy = PyMem_Malloc(size);
13643 if (copy == NULL) {
13644 PyErr_NoMemory();
13645 return NULL;
13646 }
13647 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
13648 return copy;
13649}
Martin v. Löwis5b222132007-06-10 09:51:05 +000013650
Georg Brandl66c221e2010-10-14 07:04:07 +000013651/* A _string module, to export formatter_parser and formatter_field_name_split
13652 to the string.Formatter class implemented in Python. */
13653
13654static PyMethodDef _string_methods[] = {
13655 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
13656 METH_O, PyDoc_STR("split the argument as a field name")},
13657 {"formatter_parser", (PyCFunction) formatter_parser,
13658 METH_O, PyDoc_STR("parse the argument as a format string")},
13659 {NULL, NULL}
13660};
13661
13662static struct PyModuleDef _string_module = {
13663 PyModuleDef_HEAD_INIT,
13664 "_string",
13665 PyDoc_STR("string helper module"),
13666 0,
13667 _string_methods,
13668 NULL,
13669 NULL,
13670 NULL,
13671 NULL
13672};
13673
13674PyMODINIT_FUNC
13675PyInit__string(void)
13676{
13677 return PyModule_Create(&_string_module);
13678}
13679
13680
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013681#ifdef __cplusplus
13682}
13683#endif