blob: 348a83b7a5e27099c31f1b963a0164573a35ff05 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Guido van Rossumd57fd912000-03-10 22:53:23 +000049/* Endianness switches; defaults to little endian */
50
51#ifdef WORDS_BIGENDIAN
52# define BYTEORDER_IS_BIG_ENDIAN
53#else
54# define BYTEORDER_IS_LITTLE_ENDIAN
55#endif
56
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000057/* --- Globals ------------------------------------------------------------
58
59 The globals are initialized by the _PyUnicode_Init() API and should
60 not be used before calling that API.
61
62*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064
65#ifdef __cplusplus
66extern "C" {
67#endif
68
Victor Stinner910337b2011-10-03 03:20:16 +020069#ifdef Py_DEBUG
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020070# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
Victor Stinner910337b2011-10-03 03:20:16 +020071#else
72# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
73#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020074
/* Raw accessors for the fields of the string object structs.  The
   underscore-prefixed variants only cast and dereference; the variants
   without a leading underscore additionally assert that the object is a
   ready unicode object and handle the compact ASCII layout. */

/* utf8 field of the PyCompactUnicodeObject view of op (usable as lvalue). */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* UTF-8 buffer of a ready string: for compact ASCII objects this is the
   memory located right after the PyASCIIObject header, otherwise the
   utf8 field. */
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((char*)((PyASCIIObject*)(op) + 1)) : \
         _PyUnicode_UTF8(op))
/* utf8_length field of the PyCompactUnicodeObject view of op (lvalue). */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* UTF-8 length of a ready string: for compact ASCII objects this is the
   length field of the PyASCIIObject, otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((PyASCIIObject*)(op))->length : \
         _PyUnicode_UTF8_LENGTH(op))
/* wstr field of the PyASCIIObject view of op (lvalue). */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
/* wstr_length field of the PyCompactUnicodeObject view of op (lvalue). */
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
/* length field of the PyASCIIObject view of op (lvalue, no checks). */
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
/* state field of the PyASCIIObject view of op (lvalue). */
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
/* hash field of the PyASCIIObject view of op (lvalue). */
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)
/* state.kind of op, after asserting that op is a unicode object. */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)
/* length of op, after asserting that op is a unicode object. */
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)
/* data.any field of the PyUnicodeObject view of op (lvalue). */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200109
/* File-local replacement for PyUnicode_READY: asserts that op is a
   unicode object, then evaluates to 0 when the string is already ready
   and to the result of _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (PyUnicode_IS_READY(op) ? \
      0 : \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200116
/* True if the utf8 pointer of op is the same pointer as its character
   data, i.e. no separate UTF-8 buffer is involved.  Must not be used on
   compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of op is the same pointer as its character
   data.  Only the macro argument is inspected; the expansion does not
   depend on the name of any variable at the call site. */
#define _PyUnicode_SHARE_WSTR(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
124
Victor Stinner829c0ad2011-10-03 01:08:02 +0200125/* true if the Unicode object has an allocated UTF-8 memory block
126 (not shared with other data) */
Victor Stinner910337b2011-10-03 03:20:16 +0200127#define _PyUnicode_HAS_UTF8_MEMORY(op) \
128 (assert(_PyUnicode_CHECK(op)), \
129 (!PyUnicode_IS_COMPACT_ASCII(op) \
130 && _PyUnicode_UTF8(op) \
Victor Stinner829c0ad2011-10-03 01:08:02 +0200131 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
132
Victor Stinner03490912011-10-03 23:45:12 +0200133/* true if the Unicode object has an allocated wstr memory block
134 (not shared with other data) */
135#define _PyUnicode_HAS_WSTR_MEMORY(op) \
136 (assert(_PyUnicode_CHECK(op)), \
137 (_PyUnicode_WSTR(op) && \
138 (!PyUnicode_IS_READY(op) || \
139 _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
140
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each of begin, end and to is evaluated exactly once.  The "to"
   argument is parenthesized before the cast so that an expression such
   as "buffer + offset" is converted as a whole instead of having the
   cast bind to "buffer" only.  Characters are copied in groups of four
   first; the remaining 0-3 characters are copied one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200164
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200165/* The Unicode string has been modified: reset the hash */
166#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
167
Walter Dörwald16807132007-05-25 13:52:07 +0000168/* This dictionary holds all interned unicode strings. Note that references
169 to strings in this dictionary are *not* counted in the string's ob_refcnt.
170 When the interned string reaches a refcnt of 0 the string deallocation
171 function will delete the reference from this dictionary.
172
   Another way to look at this is to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000174 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000175*/
176static PyObject *interned;
177
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000178/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200179static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000180
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200181/* List of static strings. */
182static _Py_Identifier *static_strings;
183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184/* Single character Unicode strings in the Latin-1 range are being
185 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200186static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187
/* Lookup table covering code points 0x00-0x7F: an entry is 1 when the
   code point is one of the frequent whitespace characters named in the
   row comments below, and 0 otherwise.  Sixteen entries per row. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
218
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200219/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200220static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200221static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200222static void copy_characters(
223 PyObject *to, Py_ssize_t to_start,
224 PyObject *from, Py_ssize_t from_start,
225 Py_ssize_t how_many);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200226
Alexander Belopolsky40018472011-02-26 01:02:56 +0000227static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200228unicode_fromascii(const unsigned char *s, Py_ssize_t size);
229static PyObject *
230_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
231static PyObject *
232_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
233static PyObject *
234_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
235
236static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000237unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000238 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100239 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000240 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
241
Alexander Belopolsky40018472011-02-26 01:02:56 +0000242static void
243raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300244 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100245 PyObject *unicode,
246 Py_ssize_t startpos, Py_ssize_t endpos,
247 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000248
/* Lookup table covering code points 0x00-0x7F: an entry is 1 when the
   code point is one of the line break characters named in the row
   comments below, and 0 otherwise.  Sixteen entries per row. */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
276
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300277/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
278 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000279Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000280PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000281{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000282#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000283 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000284#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000285 /* This is actually an illegal character, so it should
286 not be passed to unichr. */
287 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000288#endif
289}
290
#ifdef Py_DEBUG
/* Debug-build checker for the internal invariants of a string object.

   op must be a unicode object.  Every invariant is verified with
   assert(), so a violation aborts instead of being reported through the
   return value.  When check_content is non-zero and the string is not in
   the wchar-only representation, the characters are additionally scanned
   to verify that the storage kind is the narrowest one able to hold the
   largest character.

   Always returns 1, which allows the call to be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII: always one byte per character and ready */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: the character data is located right
               after the PyCompactUnicodeObject header and must not be
               what the utf8 pointer refers to */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            /* non-compact: the character data pointer is stored in the
               object itself */
            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr buffer exists */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* ready, non-compact string */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII: the utf8 pointer is the data pointer */
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must be the data pointer exactly when the storage
               kind is the one selected by SIZEOF_WCHAR_T below */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* a missing buffer must have a zero cached length */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);
        /* find the largest character actually stored */
        for (i=0; i < ascii->length; i++)
        {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            /* FIXME: Issue #13441: on Solaris, localeconv() and strxfrm()
               return characters outside the range U+0000-U+10FFFF. */
            /* assert(maxchar <= 0x10FFFF); */
        }
    }
    return 1;
}
#endif
404
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100405static PyObject*
406unicode_result_wchar(PyObject *unicode)
407{
408#ifndef Py_DEBUG
409 Py_ssize_t len;
410
411 assert(Py_REFCNT(unicode) == 1);
412
413 len = _PyUnicode_WSTR_LENGTH(unicode);
414 if (len == 0) {
415 Py_INCREF(unicode_empty);
416 Py_DECREF(unicode);
417 return unicode_empty;
418 }
419
420 if (len == 1) {
421 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
422 if (ch < 256) {
423 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
424 Py_DECREF(unicode);
425 return latin1_char;
426 }
427 }
428
429 if (_PyUnicode_Ready(unicode) < 0) {
430 Py_XDECREF(unicode);
431 return NULL;
432 }
433#else
434 /* don't make the result ready in debug mode to ensure that the caller
435 makes the string ready before using it */
436 assert(_PyUnicode_CheckConsistency(unicode, 1));
437#endif
438 return unicode;
439}
440
441static PyObject*
442unicode_result_ready(PyObject *unicode)
443{
444 Py_ssize_t length;
445
446 length = PyUnicode_GET_LENGTH(unicode);
447 if (length == 0) {
448 if (unicode != unicode_empty) {
449 Py_INCREF(unicode_empty);
450 Py_DECREF(unicode);
451 }
452 return unicode_empty;
453 }
454
455 if (length == 1) {
456 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
457 if (ch < 256) {
458 PyObject *latin1_char = unicode_latin1[ch];
459 if (latin1_char != NULL) {
460 if (unicode != latin1_char) {
461 Py_INCREF(latin1_char);
462 Py_DECREF(unicode);
463 }
464 return latin1_char;
465 }
466 else {
467 assert(_PyUnicode_CheckConsistency(unicode, 1));
468 Py_INCREF(unicode);
469 unicode_latin1[ch] = unicode;
470 return unicode;
471 }
472 }
473 }
474
475 assert(_PyUnicode_CheckConsistency(unicode, 1));
476 return unicode;
477}
478
479static PyObject*
480unicode_result(PyObject *unicode)
481{
482 assert(_PyUnicode_CHECK(unicode));
483 if (PyUnicode_IS_READY(unicode))
484 return unicode_result_ready(unicode);
485 else
486 return unicode_result_wchar(unicode);
487}
488
Victor Stinner3a50e702011-10-18 21:21:00 +0200489#ifdef HAVE_MBCS
490static OSVERSIONINFOEX winver;
491#endif
492
Thomas Wouters477c8d52006-05-27 19:21:47 +0000493/* --- Bloom Filters ----------------------------------------------------- */
494
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits (5, 6 or 7, depending on LONG_BIT) of each
   Unicode character as the bit index. */
498
/* the linebreak mask is set up by _PyUnicode_Init() below */
500
Antoine Pitrouf068f942010-01-13 14:19:12 +0000501#if LONG_BIT >= 128
502#define BLOOM_WIDTH 128
503#elif LONG_BIT >= 64
504#define BLOOM_WIDTH 64
505#elif LONG_BIT >= 32
506#define BLOOM_WIDTH 32
507#else
508#error "LONG_BIT is smaller than 32"
509#endif
510
Thomas Wouters477c8d52006-05-27 19:21:47 +0000511#define BLOOM_MASK unsigned long
512
513static BLOOM_MASK bloom_linebreak;
514
/* BLOOM_ADD sets, in the lvalue `mask`, the bit selected by the low bits
   of character `ch`; BLOOM evaluates to non-zero when that bit is set in
   `mask`.  The bit index is ch modulo BLOOM_WIDTH.  Both arguments are
   parenthesized so that expression arguments (e.g. "a | b") keep their
   intended meaning. */
#define BLOOM_ADD(mask, ch) (((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     (((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000517
Benjamin Peterson29060642009-01-31 22:14:21 +0000518#define BLOOM_LINEBREAK(ch) \
519 ((ch) < 128U ? ascii_linebreak[(ch)] : \
520 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000521
Alexander Belopolsky40018472011-02-26 01:02:56 +0000522Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200523make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000524{
525 /* calculate simple bloom-style bitmask for a given unicode string */
526
Antoine Pitrouf068f942010-01-13 14:19:12 +0000527 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000528 Py_ssize_t i;
529
530 mask = 0;
531 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200532 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000533
534 return mask;
535}
536
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200537#define BLOOM_MEMBER(mask, chr, str) \
538 (BLOOM(mask, chr) \
539 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000540
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200541/* Compilation of templated routines */
542
543#include "stringlib/asciilib.h"
544#include "stringlib/fastsearch.h"
545#include "stringlib/partition.h"
546#include "stringlib/split.h"
547#include "stringlib/count.h"
548#include "stringlib/find.h"
549#include "stringlib/find_max_char.h"
550#include "stringlib/localeutil.h"
551#include "stringlib/undef.h"
552
553#include "stringlib/ucs1lib.h"
554#include "stringlib/fastsearch.h"
555#include "stringlib/partition.h"
556#include "stringlib/split.h"
557#include "stringlib/count.h"
558#include "stringlib/find.h"
559#include "stringlib/find_max_char.h"
560#include "stringlib/localeutil.h"
561#include "stringlib/undef.h"
562
563#include "stringlib/ucs2lib.h"
564#include "stringlib/fastsearch.h"
565#include "stringlib/partition.h"
566#include "stringlib/split.h"
567#include "stringlib/count.h"
568#include "stringlib/find.h"
569#include "stringlib/find_max_char.h"
570#include "stringlib/localeutil.h"
571#include "stringlib/undef.h"
572
573#include "stringlib/ucs4lib.h"
574#include "stringlib/fastsearch.h"
575#include "stringlib/partition.h"
576#include "stringlib/split.h"
577#include "stringlib/count.h"
578#include "stringlib/find.h"
579#include "stringlib/find_max_char.h"
580#include "stringlib/localeutil.h"
581#include "stringlib/undef.h"
582
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200583#include "stringlib/unicodedefs.h"
584#include "stringlib/fastsearch.h"
585#include "stringlib/count.h"
586#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100587#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200588
Guido van Rossumd57fd912000-03-10 22:53:23 +0000589/* --- Unicode Object ----------------------------------------------------- */
590
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200591static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200592fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200593
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200594Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
595 Py_ssize_t size, Py_UCS4 ch,
596 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200597{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200598 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
599
600 switch (kind) {
601 case PyUnicode_1BYTE_KIND:
602 {
603 Py_UCS1 ch1 = (Py_UCS1) ch;
604 if (ch1 == ch)
605 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
606 else
607 return -1;
608 }
609 case PyUnicode_2BYTE_KIND:
610 {
611 Py_UCS2 ch2 = (Py_UCS2) ch;
612 if (ch2 == ch)
613 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
614 else
615 return -1;
616 }
617 case PyUnicode_4BYTE_KIND:
618 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
619 default:
620 assert(0);
621 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200622 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200623}
624
Victor Stinnerfe226c02011-10-03 03:52:20 +0200625static PyObject*
626resize_compact(PyObject *unicode, Py_ssize_t length)
627{
628 Py_ssize_t char_size;
629 Py_ssize_t struct_size;
630 Py_ssize_t new_size;
631 int share_wstr;
632
633 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200634 char_size = PyUnicode_KIND(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200635 if (PyUnicode_IS_COMPACT_ASCII(unicode))
636 struct_size = sizeof(PyASCIIObject);
637 else
638 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200639 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200640
641 _Py_DEC_REFTOTAL;
642 _Py_ForgetReference(unicode);
643
644 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
645 PyErr_NoMemory();
646 return NULL;
647 }
648 new_size = (struct_size + (length + 1) * char_size);
649
650 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
651 if (unicode == NULL) {
652 PyObject_Del(unicode);
653 PyErr_NoMemory();
654 return NULL;
655 }
656 _Py_NewReference(unicode);
657 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200658 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200659 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200660 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
661 _PyUnicode_WSTR_LENGTH(unicode) = length;
662 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200663 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
664 length, 0);
665 return unicode;
666}
667
/* Resize a non-compact string with a reference count of 1 in place, by
   reallocating its separately allocated buffers.

   The cached hash is reset first.  If the string is ready, its character
   buffer is reallocated to hold `length` characters plus a terminating
   null; a UTF-8 buffer that is separately allocated is released, and
   utf8/wstr pointers that aliased the character buffer are updated to
   the new buffer.  If a wstr buffer exists that does not alias the
   character buffer (or the string is not ready), the wstr buffer is
   reallocated as well.

   Returns 0 on success, or -1 with MemoryError set on overflow or
   allocation failure. */
static int
resize_inplace(PyObject *unicode, Py_ssize_t length)
{
    wchar_t *wstr;
    assert(!PyUnicode_IS_COMPACT(unicode));
    assert(Py_REFCNT(unicode) == 1);

    /* the content is about to change: reset the hash */
    _PyUnicode_DIRTY(unicode);

    if (PyUnicode_IS_READY(unicode)) {
        Py_ssize_t char_size;
        Py_ssize_t new_size;
        int share_wstr, share_utf8;
        void *data;

        data = _PyUnicode_DATA_ANY(unicode);
        assert(data != NULL);
        /* the kind value is used as the size of one character in bytes */
        char_size = PyUnicode_KIND(unicode);
        /* remember which pointers alias the data buffer before it moves */
        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
        if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
        {
            /* drop the separately allocated UTF-8 buffer */
            PyObject_DEL(_PyUnicode_UTF8(unicode));
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }

        /* check for integer overflow of (length + 1) * char_size */
        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
            PyErr_NoMemory();
            return -1;
        }
        new_size = (length + 1) * char_size;

        /* on failure the object still refers to the old buffer, because
           only the local variable has been overwritten */
        data = (PyObject *)PyObject_REALLOC(data, new_size);
        if (data == NULL) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_DATA_ANY(unicode) = data;
        if (share_wstr) {
            _PyUnicode_WSTR(unicode) = data;
            _PyUnicode_WSTR_LENGTH(unicode) = length;
        }
        if (share_utf8) {
            _PyUnicode_UTF8(unicode) = data;
            _PyUnicode_UTF8_LENGTH(unicode) = length;
        }
        _PyUnicode_LENGTH(unicode) = length;
        /* write the terminating null character */
        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
        if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
            /* no separate wstr buffer to resize: done */
            assert(_PyUnicode_CheckConsistency(unicode, 0));
            return 0;
        }
    }
    assert(_PyUnicode_WSTR(unicode) != NULL);

    /* check for integer overflow */
    if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
        PyErr_NoMemory();
        return -1;
    }
    wstr =  _PyUnicode_WSTR(unicode);
    /* on failure the object still refers to the old wstr buffer */
    wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
    if (!wstr) {
        PyErr_NoMemory();
        return -1;
    }
    _PyUnicode_WSTR(unicode) = wstr;
    _PyUnicode_WSTR(unicode)[length] = 0;
    _PyUnicode_WSTR_LENGTH(unicode) = length;
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return 0;
}
741
Victor Stinnerfe226c02011-10-03 03:52:20 +0200742static PyObject*
743resize_copy(PyObject *unicode, Py_ssize_t length)
744{
745 Py_ssize_t copy_length;
746 if (PyUnicode_IS_COMPACT(unicode)) {
747 PyObject *copy;
748 assert(PyUnicode_IS_READY(unicode));
749
750 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
751 if (copy == NULL)
752 return NULL;
753
754 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200755 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200756 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200757 }
758 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200759 PyObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200760 assert(_PyUnicode_WSTR(unicode) != NULL);
761 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200762 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200763 if (w == NULL)
764 return NULL;
765 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
766 copy_length = Py_MIN(copy_length, length);
767 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
768 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200769 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200770 }
771}
772
/* We allocate one extra character to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
781
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200782#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200783static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200784#endif
785
Alexander Belopolsky40018472011-02-26 01:02:56 +0000786static PyUnicodeObject *
787_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000788{
789 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200790 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000791
Thomas Wouters477c8d52006-05-27 19:21:47 +0000792 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000793 if (length == 0 && unicode_empty != NULL) {
794 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200795 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000796 }
797
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000798 /* Ensure we won't overflow the size. */
799 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
800 return (PyUnicodeObject *)PyErr_NoMemory();
801 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200802 if (length < 0) {
803 PyErr_SetString(PyExc_SystemError,
804 "Negative size passed to _PyUnicode_New");
805 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000806 }
807
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200808#ifdef Py_DEBUG
809 ++unicode_old_new_calls;
810#endif
811
812 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
813 if (unicode == NULL)
814 return NULL;
815 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
816 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
817 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000818 PyErr_NoMemory();
819 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000820 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200821
Jeremy Hyltond8082792003-09-16 19:41:39 +0000822 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000823 * the caller fails before initializing str -- unicode_resize()
824 * reads str[0], and the Keep-Alive optimization can keep memory
825 * allocated for str alive across a call to unicode_dealloc(unicode).
826 * We don't want unicode_resize to read uninitialized memory in
827 * that case.
828 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200829 _PyUnicode_WSTR(unicode)[0] = 0;
830 _PyUnicode_WSTR(unicode)[length] = 0;
831 _PyUnicode_WSTR_LENGTH(unicode) = length;
832 _PyUnicode_HASH(unicode) = -1;
833 _PyUnicode_STATE(unicode).interned = 0;
834 _PyUnicode_STATE(unicode).kind = 0;
835 _PyUnicode_STATE(unicode).compact = 0;
836 _PyUnicode_STATE(unicode).ready = 0;
837 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200838 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200839 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200840 _PyUnicode_UTF8(unicode) = NULL;
841 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100842 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000843 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000844
Benjamin Peterson29060642009-01-31 22:14:21 +0000845 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000846 /* XXX UNREF/NEWREF interface should be more symmetrical */
847 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000848 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000849 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000850 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000851}
852
Victor Stinnerf42dc442011-10-02 23:33:16 +0200853static const char*
854unicode_kind_name(PyObject *unicode)
855{
Victor Stinner42dfd712011-10-03 14:41:45 +0200856 /* don't check consistency: unicode_kind_name() is called from
857 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200858 if (!PyUnicode_IS_COMPACT(unicode))
859 {
860 if (!PyUnicode_IS_READY(unicode))
861 return "wstr";
862 switch(PyUnicode_KIND(unicode))
863 {
864 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200865 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200866 return "legacy ascii";
867 else
868 return "legacy latin1";
869 case PyUnicode_2BYTE_KIND:
870 return "legacy UCS2";
871 case PyUnicode_4BYTE_KIND:
872 return "legacy UCS4";
873 default:
874 return "<legacy invalid kind>";
875 }
876 }
877 assert(PyUnicode_IS_READY(unicode));
878 switch(PyUnicode_KIND(unicode))
879 {
880 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200881 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200882 return "ascii";
883 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200884 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200885 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200886 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200887 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200888 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200889 default:
890 return "<invalid compact kind>";
891 }
892}
893
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200894#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200895static int unicode_new_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200896
897/* Functions wrapping macros for use in debugger */
/* Debugger helper: function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
901
/* Debugger helper: function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
905void *_PyUnicode_data(void *unicode){
906 printf("obj %p\n", unicode);
907 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
908 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
909 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
910 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
911 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
912 return PyUnicode_DATA(unicode);
913}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200914
915void
916_PyUnicode_Dump(PyObject *op)
917{
918 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200919 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
920 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
921 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200922
Victor Stinnera849a4b2011-10-03 12:12:11 +0200923 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200924 {
925 if (ascii->state.ascii)
926 data = (ascii + 1);
927 else
928 data = (compact + 1);
929 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200930 else
931 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200932 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
933
Victor Stinnera849a4b2011-10-03 12:12:11 +0200934 if (ascii->wstr == data)
935 printf("shared ");
936 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200937
Victor Stinnera3b334d2011-10-03 13:53:37 +0200938 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200939 printf(" (%zu), ", compact->wstr_length);
940 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
941 printf("shared ");
942 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200943 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200944 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200945}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200946#endif
947
948PyObject *
949PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
950{
951 PyObject *obj;
952 PyCompactUnicodeObject *unicode;
953 void *data;
954 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200955 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200956 Py_ssize_t char_size;
957 Py_ssize_t struct_size;
958
959 /* Optimization for empty strings */
960 if (size == 0 && unicode_empty != NULL) {
961 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200962 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200963 }
964
965#ifdef Py_DEBUG
966 ++unicode_new_new_calls;
967#endif
968
Victor Stinner9e9d6892011-10-04 01:02:02 +0200969 is_ascii = 0;
970 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200971 struct_size = sizeof(PyCompactUnicodeObject);
972 if (maxchar < 128) {
973 kind_state = PyUnicode_1BYTE_KIND;
974 char_size = 1;
975 is_ascii = 1;
976 struct_size = sizeof(PyASCIIObject);
977 }
978 else if (maxchar < 256) {
979 kind_state = PyUnicode_1BYTE_KIND;
980 char_size = 1;
981 }
982 else if (maxchar < 65536) {
983 kind_state = PyUnicode_2BYTE_KIND;
984 char_size = 2;
985 if (sizeof(wchar_t) == 2)
986 is_sharing = 1;
987 }
988 else {
989 kind_state = PyUnicode_4BYTE_KIND;
990 char_size = 4;
991 if (sizeof(wchar_t) == 4)
992 is_sharing = 1;
993 }
994
995 /* Ensure we won't overflow the size. */
996 if (size < 0) {
997 PyErr_SetString(PyExc_SystemError,
998 "Negative size passed to PyUnicode_New");
999 return NULL;
1000 }
1001 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1002 return PyErr_NoMemory();
1003
1004 /* Duplicated allocation code from _PyObject_New() instead of a call to
1005 * PyObject_New() so we are able to allocate space for the object and
1006 * it's data buffer.
1007 */
1008 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1009 if (obj == NULL)
1010 return PyErr_NoMemory();
1011 obj = PyObject_INIT(obj, &PyUnicode_Type);
1012 if (obj == NULL)
1013 return NULL;
1014
1015 unicode = (PyCompactUnicodeObject *)obj;
1016 if (is_ascii)
1017 data = ((PyASCIIObject*)obj) + 1;
1018 else
1019 data = unicode + 1;
1020 _PyUnicode_LENGTH(unicode) = size;
1021 _PyUnicode_HASH(unicode) = -1;
1022 _PyUnicode_STATE(unicode).interned = 0;
1023 _PyUnicode_STATE(unicode).kind = kind_state;
1024 _PyUnicode_STATE(unicode).compact = 1;
1025 _PyUnicode_STATE(unicode).ready = 1;
1026 _PyUnicode_STATE(unicode).ascii = is_ascii;
1027 if (is_ascii) {
1028 ((char*)data)[size] = 0;
1029 _PyUnicode_WSTR(unicode) = NULL;
1030 }
1031 else if (kind_state == PyUnicode_1BYTE_KIND) {
1032 ((char*)data)[size] = 0;
1033 _PyUnicode_WSTR(unicode) = NULL;
1034 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001035 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001036 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001037 }
1038 else {
1039 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001040 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001041 if (kind_state == PyUnicode_2BYTE_KIND)
1042 ((Py_UCS2*)data)[size] = 0;
1043 else /* kind_state == PyUnicode_4BYTE_KIND */
1044 ((Py_UCS4*)data)[size] = 0;
1045 if (is_sharing) {
1046 _PyUnicode_WSTR_LENGTH(unicode) = size;
1047 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1048 }
1049 else {
1050 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1051 _PyUnicode_WSTR(unicode) = NULL;
1052 }
1053 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01001054 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001055 return obj;
1056}
1057
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4.
   A high surrogate immediately followed by a low surrogate is combined
   into a single code point; every other unit, including a lone surrogate,
   is copied as is.  The other conversions are implemented as macros for
   efficiency.

   The output is written to the 4-byte data of `unicode`, whose kind must
   already be PyUnicode_4BYTE_KIND.  This function assumes that unicode can
   hold one more code point than wstr characters for a terminating null
   character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* Surrogate pair: two input units become one code point. */
            *dst = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst = *src;
            src += 1;
        }
        dst++;
    }
    /* The destination must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1097
Victor Stinnercd9950f2011-10-02 00:34:53 +02001098static int
1099_PyUnicode_Dirty(PyObject *unicode)
1100{
Victor Stinner910337b2011-10-03 03:20:16 +02001101 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +02001102 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +02001103 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +02001104 "Cannot modify a string having more than 1 reference");
1105 return -1;
1106 }
1107 _PyUnicode_DIRTY(unicode);
1108 return 0;
1109}
1110
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001111static int
1112_copy_characters(PyObject *to, Py_ssize_t to_start,
1113 PyObject *from, Py_ssize_t from_start,
1114 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001115{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001116 unsigned int from_kind, to_kind;
1117 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001118 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001119
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001120 assert(PyUnicode_Check(from));
1121 assert(PyUnicode_Check(to));
1122 assert(PyUnicode_IS_READY(from));
1123 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001124
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001125 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1126 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1127 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001128
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001129 if (how_many == 0)
1130 return 0;
1131
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001132 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001133 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001134 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001135 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001136
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001137#ifdef Py_DEBUG
1138 if (!check_maxchar
1139 && (from_kind > to_kind
1140 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001141 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001142 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1143 Py_UCS4 ch;
1144 Py_ssize_t i;
1145 for (i=0; i < how_many; i++) {
1146 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1147 assert(ch <= to_maxchar);
1148 }
1149 }
1150#endif
1151 fast = (from_kind == to_kind);
1152 if (check_maxchar
1153 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1154 {
1155 /* deny latin1 => ascii */
1156 fast = 0;
1157 }
1158
1159 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001160 Py_MEMCPY((char*)to_data + to_kind * to_start,
1161 (char*)from_data + from_kind * from_start,
1162 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001163 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001164 else if (from_kind == PyUnicode_1BYTE_KIND
1165 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001166 {
1167 _PyUnicode_CONVERT_BYTES(
1168 Py_UCS1, Py_UCS2,
1169 PyUnicode_1BYTE_DATA(from) + from_start,
1170 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1171 PyUnicode_2BYTE_DATA(to) + to_start
1172 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001173 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001174 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001175 && to_kind == PyUnicode_4BYTE_KIND)
1176 {
1177 _PyUnicode_CONVERT_BYTES(
1178 Py_UCS1, Py_UCS4,
1179 PyUnicode_1BYTE_DATA(from) + from_start,
1180 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1181 PyUnicode_4BYTE_DATA(to) + to_start
1182 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001183 }
1184 else if (from_kind == PyUnicode_2BYTE_KIND
1185 && to_kind == PyUnicode_4BYTE_KIND)
1186 {
1187 _PyUnicode_CONVERT_BYTES(
1188 Py_UCS2, Py_UCS4,
1189 PyUnicode_2BYTE_DATA(from) + from_start,
1190 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1191 PyUnicode_4BYTE_DATA(to) + to_start
1192 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001193 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001194 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001195 /* check if max_char(from substring) <= max_char(to) */
1196 if (from_kind > to_kind
1197 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001198 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001199 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001200 /* slow path to check for character overflow */
1201 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001202 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001203 Py_ssize_t i;
1204
Victor Stinner56c161a2011-10-06 02:47:11 +02001205#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001206 for (i=0; i < how_many; i++) {
1207 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001208 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001209 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1210 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001211#else
1212 if (!check_maxchar) {
1213 for (i=0; i < how_many; i++) {
1214 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1215 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1216 }
1217 }
1218 else {
1219 for (i=0; i < how_many; i++) {
1220 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1221 if (ch > to_maxchar)
1222 return 1;
1223 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1224 }
1225 }
1226#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001227 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001228 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001229 assert(0 && "inconsistent state");
1230 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001231 }
1232 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001233 return 0;
1234}
1235
1236static void
1237copy_characters(PyObject *to, Py_ssize_t to_start,
1238 PyObject *from, Py_ssize_t from_start,
1239 Py_ssize_t how_many)
1240{
1241 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1242}
1243
1244Py_ssize_t
1245PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1246 PyObject *from, Py_ssize_t from_start,
1247 Py_ssize_t how_many)
1248{
1249 int err;
1250
1251 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1252 PyErr_BadInternalCall();
1253 return -1;
1254 }
1255
1256 if (PyUnicode_READY(from))
1257 return -1;
1258 if (PyUnicode_READY(to))
1259 return -1;
1260
1261 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1262 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1263 PyErr_Format(PyExc_SystemError,
1264 "Cannot write %zi characters at %zi "
1265 "in a string of %zi characters",
1266 how_many, to_start, PyUnicode_GET_LENGTH(to));
1267 return -1;
1268 }
1269
1270 if (how_many == 0)
1271 return 0;
1272
1273 if (_PyUnicode_Dirty(to))
1274 return -1;
1275
1276 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1277 if (err) {
1278 PyErr_Format(PyExc_SystemError,
1279 "Cannot copy %s characters "
1280 "into a string of %s characters",
1281 unicode_kind_name(from),
1282 unicode_kind_name(to));
1283 return -1;
1284 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001285 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001286}
1287
Victor Stinner17222162011-09-28 22:15:37 +02001288/* Find the maximum code point and count the number of surrogate pairs so a
1289 correct string length can be computed before converting a string to UCS4.
1290 This function counts single surrogates as a character and not as a pair.
1291
1292 Return 0 on success, or -1 on error. */
1293static int
1294find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1295 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001296{
1297 const wchar_t *iter;
1298
Victor Stinnerc53be962011-10-02 21:33:54 +02001299 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001300 *num_surrogates = 0;
1301 *maxchar = 0;
1302
1303 for (iter = begin; iter < end; ) {
Victor Stinnerae864852011-10-05 14:02:44 +02001304 if (*iter > *maxchar) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001305 *maxchar = *iter;
Victor Stinnerae864852011-10-05 14:02:44 +02001306#if SIZEOF_WCHAR_T != 2
1307 if (*maxchar >= 0x10000)
1308 return 0;
1309#endif
1310 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001311#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001312 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1313 && (iter+1) < end
1314 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001315 {
1316 Py_UCS4 surrogate_val;
Victor Stinnerca4f2072011-11-22 03:38:40 +01001317 surrogate_val = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001318 ++(*num_surrogates);
1319 if (surrogate_val > *maxchar)
1320 *maxchar = surrogate_val;
1321 iter += 2;
1322 }
1323 else
1324 iter++;
1325#else
1326 iter++;
1327#endif
1328 }
1329 return 0;
1330}
1331
1332#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001333static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001334#endif
1335
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001336int
1337_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001338{
1339 wchar_t *end;
1340 Py_UCS4 maxchar = 0;
1341 Py_ssize_t num_surrogates;
1342#if SIZEOF_WCHAR_T == 2
1343 Py_ssize_t length_wo_surrogates;
1344#endif
1345
Georg Brandl7597add2011-10-05 16:36:47 +02001346 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001347 strings were created using _PyObject_New() and where no canonical
1348 representation (the str field) has been set yet aka strings
1349 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001350 assert(_PyUnicode_CHECK(unicode));
1351 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001352 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001353 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001354 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001355 /* Actually, it should neither be interned nor be anything else: */
1356 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001357
1358#ifdef Py_DEBUG
1359 ++unicode_ready_calls;
1360#endif
1361
1362 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001363 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001364 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001365 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001366
1367 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001368 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1369 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001370 PyErr_NoMemory();
1371 return -1;
1372 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001373 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001374 _PyUnicode_WSTR(unicode), end,
1375 PyUnicode_1BYTE_DATA(unicode));
1376 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1377 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1378 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1379 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001380 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001381 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001382 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001383 }
1384 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001385 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001386 _PyUnicode_UTF8(unicode) = NULL;
1387 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001388 }
1389 PyObject_FREE(_PyUnicode_WSTR(unicode));
1390 _PyUnicode_WSTR(unicode) = NULL;
1391 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1392 }
1393 /* In this case we might have to convert down from 4-byte native
1394 wchar_t to 2-byte unicode. */
1395 else if (maxchar < 65536) {
1396 assert(num_surrogates == 0 &&
1397 "FindMaxCharAndNumSurrogatePairs() messed up");
1398
Victor Stinner506f5922011-09-28 22:34:18 +02001399#if SIZEOF_WCHAR_T == 2
1400 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001401 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001402 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1403 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1404 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001405 _PyUnicode_UTF8(unicode) = NULL;
1406 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001407#else
1408 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001409 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001410 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001411 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001412 PyErr_NoMemory();
1413 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001414 }
Victor Stinner506f5922011-09-28 22:34:18 +02001415 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1416 _PyUnicode_WSTR(unicode), end,
1417 PyUnicode_2BYTE_DATA(unicode));
1418 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1419 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1420 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001421 _PyUnicode_UTF8(unicode) = NULL;
1422 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001423 PyObject_FREE(_PyUnicode_WSTR(unicode));
1424 _PyUnicode_WSTR(unicode) = NULL;
1425 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1426#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001427 }
1428 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1429 else {
1430#if SIZEOF_WCHAR_T == 2
1431 /* in case the native representation is 2-bytes, we need to allocate a
1432 new normalized 4-byte version. */
1433 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001434 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1435 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001436 PyErr_NoMemory();
1437 return -1;
1438 }
1439 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1440 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001441 _PyUnicode_UTF8(unicode) = NULL;
1442 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001443 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1444 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001445 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001446 PyObject_FREE(_PyUnicode_WSTR(unicode));
1447 _PyUnicode_WSTR(unicode) = NULL;
1448 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1449#else
1450 assert(num_surrogates == 0);
1451
Victor Stinnerc3c74152011-10-02 20:39:55 +02001452 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001453 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001454 _PyUnicode_UTF8(unicode) = NULL;
1455 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001456 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1457#endif
1458 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1459 }
1460 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001461 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001462 return 0;
1463}
1464
Alexander Belopolsky40018472011-02-26 01:02:56 +00001465static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001466unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001467{
Walter Dörwald16807132007-05-25 13:52:07 +00001468 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001469 case SSTATE_NOT_INTERNED:
1470 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001471
Benjamin Peterson29060642009-01-31 22:14:21 +00001472 case SSTATE_INTERNED_MORTAL:
1473 /* revive dead object temporarily for DelItem */
1474 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001475 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001476 Py_FatalError(
1477 "deletion of interned string failed");
1478 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001479
Benjamin Peterson29060642009-01-31 22:14:21 +00001480 case SSTATE_INTERNED_IMMORTAL:
1481 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001482
Benjamin Peterson29060642009-01-31 22:14:21 +00001483 default:
1484 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001485 }
1486
Victor Stinner03490912011-10-03 23:45:12 +02001487 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001488 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001489 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001490 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001491
1492 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinner7931d9a2011-11-04 00:22:48 +01001493 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001494 }
1495 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001496 if (_PyUnicode_DATA_ANY(unicode))
1497 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinner7931d9a2011-11-04 00:22:48 +01001498 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001499 }
1500}
1501
#ifdef Py_DEBUG
/* Debug-only predicate: return 1 if `unicode` is the shared empty string or
   the object stored in unicode_latin1[] for its single character, and 0
   otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *header = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;
    /* Only one-character strings that are not of wchar kind can be
       entries of the unicode_latin1[] table */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return ch < 256 && unicode_latin1[ch] == unicode;
}
#endif
1518
Alexander Belopolsky40018472011-02-26 01:02:56 +00001519static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001520unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001521{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001522 if (Py_REFCNT(unicode) != 1)
1523 return 0;
1524 if (PyUnicode_CHECK_INTERNED(unicode))
1525 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001526#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001527 /* singleton refcount is greater than 1 */
1528 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001529#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001530 return 1;
1531}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001532
Victor Stinnerfe226c02011-10-03 03:52:20 +02001533static int
1534unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1535{
1536 PyObject *unicode;
1537 Py_ssize_t old_length;
1538
1539 assert(p_unicode != NULL);
1540 unicode = *p_unicode;
1541
1542 assert(unicode != NULL);
1543 assert(PyUnicode_Check(unicode));
1544 assert(0 <= length);
1545
Victor Stinner910337b2011-10-03 03:20:16 +02001546 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001547 old_length = PyUnicode_WSTR_LENGTH(unicode);
1548 else
1549 old_length = PyUnicode_GET_LENGTH(unicode);
1550 if (old_length == length)
1551 return 0;
1552
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001553 if (length == 0) {
1554 Py_DECREF(*p_unicode);
1555 *p_unicode = unicode_empty;
1556 Py_INCREF(*p_unicode);
1557 return 0;
1558 }
1559
Victor Stinnerfe226c02011-10-03 03:52:20 +02001560 if (!unicode_resizable(unicode)) {
1561 PyObject *copy = resize_copy(unicode, length);
1562 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001563 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001564 Py_DECREF(*p_unicode);
1565 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001566 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001567 }
1568
Victor Stinnerfe226c02011-10-03 03:52:20 +02001569 if (PyUnicode_IS_COMPACT(unicode)) {
1570 *p_unicode = resize_compact(unicode, length);
1571 if (*p_unicode == NULL)
1572 return -1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001573 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001574 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001575 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001576 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001577}
1578
Alexander Belopolsky40018472011-02-26 01:02:56 +00001579int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001580PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001581{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001582 PyObject *unicode;
1583 if (p_unicode == NULL) {
1584 PyErr_BadInternalCall();
1585 return -1;
1586 }
1587 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001588 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001589 {
1590 PyErr_BadInternalCall();
1591 return -1;
1592 }
1593 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001594}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001595
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001596static int
Victor Stinner0a045ef2011-11-09 00:02:42 +01001597unicode_widen(PyObject **p_unicode, unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001598{
1599 PyObject *result;
1600 assert(PyUnicode_IS_READY(*p_unicode));
1601 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1602 return 0;
1603 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1604 maxchar);
1605 if (result == NULL)
1606 return -1;
1607 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0,
1608 PyUnicode_GET_LENGTH(*p_unicode));
1609 Py_DECREF(*p_unicode);
1610 *p_unicode = result;
1611 return 0;
1612}
1613
1614static int
1615unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1616 Py_UCS4 ch)
1617{
1618 if (unicode_widen(p_unicode, ch) < 0)
1619 return -1;
1620 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1621 PyUnicode_DATA(*p_unicode),
1622 (*pos)++, ch);
1623 return 0;
1624}
1625
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001626static PyObject*
1627get_latin1_char(unsigned char ch)
1628{
Victor Stinnera464fc12011-10-02 20:39:30 +02001629 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001630 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001631 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001632 if (!unicode)
1633 return NULL;
1634 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001635 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001636 unicode_latin1[ch] = unicode;
1637 }
1638 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001639 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001640}
1641
Alexander Belopolsky40018472011-02-26 01:02:56 +00001642PyObject *
1643PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001644{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001645 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001646 Py_UCS4 maxchar = 0;
1647 Py_ssize_t num_surrogates;
1648
1649 if (u == NULL)
1650 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001651
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001652 /* If the Unicode data is known at construction time, we can apply
1653 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001654
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001655 /* Optimization for empty strings */
1656 if (size == 0 && unicode_empty != NULL) {
1657 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001658 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001659 }
Tim Petersced69f82003-09-16 20:30:58 +00001660
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001661 /* Single character Unicode objects in the Latin-1 range are
1662 shared when using this constructor */
1663 if (size == 1 && *u < 256)
1664 return get_latin1_char((unsigned char)*u);
1665
1666 /* If not empty and not single character, copy the Unicode data
1667 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001668 if (find_maxchar_surrogates(u, u + size,
1669 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001670 return NULL;
1671
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001672 unicode = PyUnicode_New(size - num_surrogates,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001673 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001674 if (!unicode)
1675 return NULL;
1676
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001677 switch (PyUnicode_KIND(unicode)) {
1678 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001679 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001680 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1681 break;
1682 case PyUnicode_2BYTE_KIND:
1683#if Py_UNICODE_SIZE == 2
1684 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1685#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001686 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001687 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1688#endif
1689 break;
1690 case PyUnicode_4BYTE_KIND:
1691#if SIZEOF_WCHAR_T == 2
1692 /* This is the only case which has to process surrogates, thus
1693 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001694 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001695#else
1696 assert(num_surrogates == 0);
1697 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1698#endif
1699 break;
1700 default:
1701 assert(0 && "Impossible state");
1702 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001703
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001704 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001705}
1706
Alexander Belopolsky40018472011-02-26 01:02:56 +00001707PyObject *
1708PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001709{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001710 if (size < 0) {
1711 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001712 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001713 return NULL;
1714 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001715
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001716 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001717 some optimizations which share commonly used objects.
1718 Also, this means the input must be UTF-8, so fall back to the
1719 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001720 if (u != NULL) {
1721
Benjamin Peterson29060642009-01-31 22:14:21 +00001722 /* Optimization for empty strings */
1723 if (size == 0 && unicode_empty != NULL) {
1724 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001725 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001726 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001727
1728 /* Single characters are shared when using this constructor.
1729 Restrict to ASCII, since the input must be UTF-8. */
Victor Stinner9faa3842011-10-23 20:06:00 +02001730 if (size == 1 && (unsigned char)*u < 128)
1731 return get_latin1_char((unsigned char)*u);
Martin v. Löwis9c121062007-08-05 20:26:11 +00001732
1733 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001734 }
1735
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001736 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001737}
1738
Alexander Belopolsky40018472011-02-26 01:02:56 +00001739PyObject *
1740PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001741{
1742 size_t size = strlen(u);
1743 if (size > PY_SSIZE_T_MAX) {
1744 PyErr_SetString(PyExc_OverflowError, "input too long");
1745 return NULL;
1746 }
1747
1748 return PyUnicode_FromStringAndSize(u, size);
1749}
1750
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001751PyObject *
1752_PyUnicode_FromId(_Py_Identifier *id)
1753{
1754 if (!id->object) {
1755 id->object = PyUnicode_FromString(id->string);
1756 if (!id->object)
1757 return NULL;
1758 PyUnicode_InternInPlace(&id->object);
1759 assert(!id->next);
1760 id->next = static_strings;
1761 static_strings = id;
1762 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001763 return id->object;
1764}
1765
1766void
1767_PyUnicode_ClearStaticStrings()
1768{
1769 _Py_Identifier *i;
1770 for (i = static_strings; i; i = i->next) {
1771 Py_DECREF(i->object);
1772 i->object = NULL;
1773 i->next = NULL;
1774 }
1775}
1776
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001777/* Internal function, don't check maximum character */
1778
Victor Stinnere57b1c02011-09-28 22:20:48 +02001779static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001780unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001781{
Victor Stinner0617b6e2011-10-05 23:26:01 +02001782 PyObject *res;
1783#ifdef Py_DEBUG
1784 const unsigned char *p;
1785 const unsigned char *end = s + size;
1786 for (p=s; p < end; p++) {
1787 assert(*p < 128);
1788 }
1789#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001790 if (size == 1)
1791 return get_latin1_char(s[0]);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001792 res = PyUnicode_New(size, 127);
Victor Stinner702c7342011-10-05 13:50:52 +02001793 if (!res)
1794 return NULL;
Victor Stinner0617b6e2011-10-05 23:26:01 +02001795 memcpy(PyUnicode_1BYTE_DATA(res), s, size);
Victor Stinner702c7342011-10-05 13:50:52 +02001796 return res;
1797}
1798
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001799static Py_UCS4
1800kind_maxchar_limit(unsigned int kind)
1801{
1802 switch(kind) {
1803 case PyUnicode_1BYTE_KIND:
1804 return 0x80;
1805 case PyUnicode_2BYTE_KIND:
1806 return 0x100;
1807 case PyUnicode_4BYTE_KIND:
1808 return 0x10000;
1809 default:
1810 assert(0 && "invalid kind");
1811 return 0x10ffff;
1812 }
1813}
1814
Victor Stinner702c7342011-10-05 13:50:52 +02001815static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001816_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001817{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001818 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001819 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001820
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001821 if (size == 0) {
1822 Py_INCREF(unicode_empty);
1823 return unicode_empty;
1824 }
1825 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001826 if (size == 1)
1827 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001828
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001829 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001830 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001831 if (!res)
1832 return NULL;
1833 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001834 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001835 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001836}
1837
Victor Stinnere57b1c02011-09-28 22:20:48 +02001838static PyObject*
1839_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001840{
1841 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001842 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001843
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001844 if (size == 0) {
1845 Py_INCREF(unicode_empty);
1846 return unicode_empty;
1847 }
1848 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001849 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001850 return get_latin1_char((unsigned char)u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001851
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001852 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001853 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001854 if (!res)
1855 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001856 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001857 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001858 else {
1859 _PyUnicode_CONVERT_BYTES(
1860 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1861 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001862 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001863 return res;
1864}
1865
Victor Stinnere57b1c02011-09-28 22:20:48 +02001866static PyObject*
1867_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001868{
1869 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001870 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001871
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001872 if (size == 0) {
1873 Py_INCREF(unicode_empty);
1874 return unicode_empty;
1875 }
1876 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001877 if (size == 1 && u[0] < 256)
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001878 return get_latin1_char((unsigned char)u[0]);
1879
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001880 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001881 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001882 if (!res)
1883 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001884 if (max_char < 256)
1885 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1886 PyUnicode_1BYTE_DATA(res));
1887 else if (max_char < 0x10000)
1888 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1889 PyUnicode_2BYTE_DATA(res));
1890 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001891 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001892 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001893 return res;
1894}
1895
1896PyObject*
1897PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1898{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001899 if (size < 0) {
1900 PyErr_SetString(PyExc_ValueError, "size must be positive");
1901 return NULL;
1902 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001903 switch(kind) {
1904 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001905 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001906 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001907 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001908 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001909 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001910 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02001911 PyErr_SetString(PyExc_SystemError, "invalid kind");
1912 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001913 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001914}
1915
Victor Stinner25a4b292011-10-06 12:31:55 +02001916/* Ensure that a string uses the most efficient storage, if it is not the
1917 case: create a new string with of the right kind. Write NULL into *p_unicode
1918 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001919static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001920unicode_adjust_maxchar(PyObject **p_unicode)
1921{
1922 PyObject *unicode, *copy;
1923 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001924 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02001925 unsigned int kind;
1926
1927 assert(p_unicode != NULL);
1928 unicode = *p_unicode;
1929 assert(PyUnicode_IS_READY(unicode));
1930 if (PyUnicode_IS_ASCII(unicode))
1931 return;
1932
1933 len = PyUnicode_GET_LENGTH(unicode);
1934 kind = PyUnicode_KIND(unicode);
1935 if (kind == PyUnicode_1BYTE_KIND) {
1936 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001937 max_char = ucs1lib_find_max_char(u, u + len);
1938 if (max_char >= 128)
1939 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001940 }
1941 else if (kind == PyUnicode_2BYTE_KIND) {
1942 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001943 max_char = ucs2lib_find_max_char(u, u + len);
1944 if (max_char >= 256)
1945 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001946 }
1947 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001948 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001949 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001950 max_char = ucs4lib_find_max_char(u, u + len);
1951 if (max_char >= 0x10000)
1952 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001953 }
Victor Stinner25a4b292011-10-06 12:31:55 +02001954 copy = PyUnicode_New(len, max_char);
1955 copy_characters(copy, 0, unicode, 0, len);
1956 Py_DECREF(unicode);
1957 *p_unicode = copy;
1958}
1959
Victor Stinner034f6cf2011-09-30 02:26:44 +02001960PyObject*
1961PyUnicode_Copy(PyObject *unicode)
1962{
Victor Stinner87af4f22011-11-21 23:03:47 +01001963 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001964 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001965
Victor Stinner034f6cf2011-09-30 02:26:44 +02001966 if (!PyUnicode_Check(unicode)) {
1967 PyErr_BadInternalCall();
1968 return NULL;
1969 }
1970 if (PyUnicode_READY(unicode))
1971 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001972
Victor Stinner87af4f22011-11-21 23:03:47 +01001973 length = PyUnicode_GET_LENGTH(unicode);
1974 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001975 if (!copy)
1976 return NULL;
1977 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1978
Victor Stinner87af4f22011-11-21 23:03:47 +01001979 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
1980 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001981 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001982 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001983}
1984
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001985
Victor Stinnerbc603d12011-10-02 01:00:40 +02001986/* Widen Unicode objects to larger buffers. Don't write terminating null
1987 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001988
1989void*
1990_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1991{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001992 Py_ssize_t len;
1993 void *result;
1994 unsigned int skind;
1995
1996 if (PyUnicode_READY(s))
1997 return NULL;
1998
1999 len = PyUnicode_GET_LENGTH(s);
2000 skind = PyUnicode_KIND(s);
2001 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002002 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002003 return NULL;
2004 }
2005 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002006 case PyUnicode_2BYTE_KIND:
2007 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2008 if (!result)
2009 return PyErr_NoMemory();
2010 assert(skind == PyUnicode_1BYTE_KIND);
2011 _PyUnicode_CONVERT_BYTES(
2012 Py_UCS1, Py_UCS2,
2013 PyUnicode_1BYTE_DATA(s),
2014 PyUnicode_1BYTE_DATA(s) + len,
2015 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002016 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002017 case PyUnicode_4BYTE_KIND:
2018 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2019 if (!result)
2020 return PyErr_NoMemory();
2021 if (skind == PyUnicode_2BYTE_KIND) {
2022 _PyUnicode_CONVERT_BYTES(
2023 Py_UCS2, Py_UCS4,
2024 PyUnicode_2BYTE_DATA(s),
2025 PyUnicode_2BYTE_DATA(s) + len,
2026 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002027 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002028 else {
2029 assert(skind == PyUnicode_1BYTE_KIND);
2030 _PyUnicode_CONVERT_BYTES(
2031 Py_UCS1, Py_UCS4,
2032 PyUnicode_1BYTE_DATA(s),
2033 PyUnicode_1BYTE_DATA(s) + len,
2034 result);
2035 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002036 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002037 default:
2038 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002039 }
Victor Stinner01698042011-10-04 00:04:26 +02002040 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002041 return NULL;
2042}
2043
2044static Py_UCS4*
2045as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2046 int copy_null)
2047{
2048 int kind;
2049 void *data;
2050 Py_ssize_t len, targetlen;
2051 if (PyUnicode_READY(string) == -1)
2052 return NULL;
2053 kind = PyUnicode_KIND(string);
2054 data = PyUnicode_DATA(string);
2055 len = PyUnicode_GET_LENGTH(string);
2056 targetlen = len;
2057 if (copy_null)
2058 targetlen++;
2059 if (!target) {
2060 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2061 PyErr_NoMemory();
2062 return NULL;
2063 }
2064 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2065 if (!target) {
2066 PyErr_NoMemory();
2067 return NULL;
2068 }
2069 }
2070 else {
2071 if (targetsize < targetlen) {
2072 PyErr_Format(PyExc_SystemError,
2073 "string is longer than the buffer");
2074 if (copy_null && 0 < targetsize)
2075 target[0] = 0;
2076 return NULL;
2077 }
2078 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002079 if (kind == PyUnicode_1BYTE_KIND) {
2080 Py_UCS1 *start = (Py_UCS1 *) data;
2081 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002082 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002083 else if (kind == PyUnicode_2BYTE_KIND) {
2084 Py_UCS2 *start = (Py_UCS2 *) data;
2085 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2086 }
2087 else {
2088 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002089 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002090 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002091 if (copy_null)
2092 target[len] = 0;
2093 return target;
2094}
2095
2096Py_UCS4*
2097PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2098 int copy_null)
2099{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002100 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002101 PyErr_BadInternalCall();
2102 return NULL;
2103 }
2104 return as_ucs4(string, target, targetsize, copy_null);
2105}
2106
2107Py_UCS4*
2108PyUnicode_AsUCS4Copy(PyObject *string)
2109{
2110 return as_ucs4(string, NULL, 0, 1);
2111}
2112
2113#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002114
Alexander Belopolsky40018472011-02-26 01:02:56 +00002115PyObject *
2116PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002117{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002118 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +00002119 if (size == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002120 return PyUnicode_New(0, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00002121 PyErr_BadInternalCall();
2122 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002123 }
2124
Martin v. Löwis790465f2008-04-05 20:41:37 +00002125 if (size == -1) {
2126 size = wcslen(w);
2127 }
2128
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002129 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002130}
2131
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002132#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002133
Walter Dörwald346737f2007-05-31 10:44:43 +00002134static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002135makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2136 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002137{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002138 *fmt++ = '%';
2139 if (width) {
2140 if (zeropad)
2141 *fmt++ = '0';
2142 fmt += sprintf(fmt, "%d", width);
2143 }
2144 if (precision)
2145 fmt += sprintf(fmt, ".%d", precision);
2146 if (longflag)
2147 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002148 else if (longlongflag) {
2149 /* longlongflag should only ever be nonzero on machines with
2150 HAVE_LONG_LONG defined */
2151#ifdef HAVE_LONG_LONG
2152 char *f = PY_FORMAT_LONG_LONG;
2153 while (*f)
2154 *fmt++ = *f++;
2155#else
2156 /* we shouldn't ever get here */
2157 assert(0);
2158 *fmt++ = 'l';
2159#endif
2160 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002161 else if (size_tflag) {
2162 char *f = PY_FORMAT_SIZE_T;
2163 while (*f)
2164 *fmt++ = *f++;
2165 }
2166 *fmt++ = c;
2167 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002168}
2169
/* Helper for PyUnicode_FromFormatV(): parse the "width.precision" part and
   the length modifier of one conversion specification.

   f must point at the '%' that introduces the specification.  A leading
   '0' flag is not treated specially here: it is consumed as part of the
   width digits, and callers that care about it look at f[1] themselves.

   Each p_* argument may be NULL when the caller is not interested in that
   value; otherwise it receives:
     *p_width, *p_precision  the decimal values found (0 when absent);
                             values too large for an int saturate at
                             INT_MAX instead of overflowing
     *p_longflag             1 for "l"  followed by d, u or i
     *p_longlongflag         1 for "ll" followed by d, u or i (only when
                             HAVE_LONG_LONG is defined)
     *p_size_tflag           1 for "z"  followed by d, u or i

   Returns a pointer to the conversion character.  A length modifier is
   only skipped when it is followed by d, u or i; otherwise the returned
   pointer is left on the modifier itself.  For the malformed inputs
   handled below, the pointer is moved one character backward so that the
   caller's "f++" lands on the '%' or on the terminating NUL. */
static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width, precision, longflag, longlongflag, size_tflag;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;
    width = 0;
    while (Py_ISDIGIT((unsigned char)*f)) {
        int digit = *f++ - '0';
        /* saturate rather than overflow (signed overflow is undefined) */
        if (width > (INT_MAX - digit) / 10)
            width = INT_MAX;
        else
            width = width * 10 + digit;
    }
    precision = 0;
    if (*f == '.') {
        f++;
        while (Py_ISDIGIT((unsigned char)*f)) {
            int digit = *f++ - '0';
            if (precision > (INT_MAX - digit) / 10)
                precision = INT_MAX;
            else
                precision = precision * 10 + digit;
        }
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;

    /* Handle %ld, %lu, %li, %lld, %llu and %lli. */
    longflag = 0;
    longlongflag = 0;
    size_tflag = 0;

    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    /* handle the size_t flag: %zd, %zu and %zi */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        ++f;
    }
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
2234
/* Upper bound on the number of characters produced by %ld.  A 64-bit
   integer printed in decimal, including an optional sign, fits in 21
   characters. */
#define MAX_LONG_CHARS 21
/* Upper bound on the number of characters produced by %lld: at most
   ceil(log10(256) * SIZEOF_LONG_LONG) decimal digits plus one character
   for the sign.  53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (((SIZEOF_LONG_LONG) * 53 - 1) / 22 + 2)
2242
Walter Dörwaldd2034312007-05-18 16:29:38 +00002243PyObject *
2244PyUnicode_FromFormatV(const char *format, va_list vargs)
2245{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002246 va_list count;
2247 Py_ssize_t callcount = 0;
2248 PyObject **callresults = NULL;
2249 PyObject **callresult = NULL;
2250 Py_ssize_t n = 0;
2251 int width = 0;
2252 int precision = 0;
2253 int zeropad;
2254 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002255 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002256 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002257 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002258 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2259 Py_UCS4 argmaxchar;
2260 Py_ssize_t numbersize = 0;
2261 char *numberresults = NULL;
2262 char *numberresult = NULL;
2263 Py_ssize_t i;
2264 int kind;
2265 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002266
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002267 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002268 /* step 1: count the number of %S/%R/%A/%s format specifications
2269 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2270 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002271 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002272 * also estimate a upper bound for all the number formats in the string,
2273 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002274 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002275 for (f = format; *f; f++) {
2276 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002277 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002278 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2279 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2280 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2281 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002282
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002283 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002284#ifdef HAVE_LONG_LONG
2285 if (longlongflag) {
2286 if (width < MAX_LONG_LONG_CHARS)
2287 width = MAX_LONG_LONG_CHARS;
2288 }
2289 else
2290#endif
2291 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2292 including sign. Decimal takes the most space. This
2293 isn't enough for octal. If a width is specified we
2294 need more (which we allocate later). */
2295 if (width < MAX_LONG_CHARS)
2296 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002297
2298 /* account for the size + '\0' to separate numbers
2299 inside of the numberresults buffer */
2300 numbersize += (width + 1);
2301 }
2302 }
2303 else if ((unsigned char)*f > 127) {
2304 PyErr_Format(PyExc_ValueError,
2305 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2306 "string, got a non-ASCII byte: 0x%02x",
2307 (unsigned char)*f);
2308 return NULL;
2309 }
2310 }
2311 /* step 2: allocate memory for the results of
2312 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2313 if (callcount) {
2314 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2315 if (!callresults) {
2316 PyErr_NoMemory();
2317 return NULL;
2318 }
2319 callresult = callresults;
2320 }
2321 /* step 2.5: allocate memory for the results of formating numbers */
2322 if (numbersize) {
2323 numberresults = PyObject_Malloc(numbersize);
2324 if (!numberresults) {
2325 PyErr_NoMemory();
2326 goto fail;
2327 }
2328 numberresult = numberresults;
2329 }
2330
2331 /* step 3: format numbers and figure out how large a buffer we need */
2332 for (f = format; *f; f++) {
2333 if (*f == '%') {
2334 const char* p;
2335 int longflag;
2336 int longlongflag;
2337 int size_tflag;
2338 int numprinted;
2339
2340 p = f;
2341 zeropad = (f[1] == '0');
2342 f = parse_format_flags(f, &width, &precision,
2343 &longflag, &longlongflag, &size_tflag);
2344 switch (*f) {
2345 case 'c':
2346 {
2347 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002348 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002349 n++;
2350 break;
2351 }
2352 case '%':
2353 n++;
2354 break;
2355 case 'i':
2356 case 'd':
2357 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2358 width, precision, *f);
2359 if (longflag)
2360 numprinted = sprintf(numberresult, fmt,
2361 va_arg(count, long));
2362#ifdef HAVE_LONG_LONG
2363 else if (longlongflag)
2364 numprinted = sprintf(numberresult, fmt,
2365 va_arg(count, PY_LONG_LONG));
2366#endif
2367 else if (size_tflag)
2368 numprinted = sprintf(numberresult, fmt,
2369 va_arg(count, Py_ssize_t));
2370 else
2371 numprinted = sprintf(numberresult, fmt,
2372 va_arg(count, int));
2373 n += numprinted;
2374 /* advance by +1 to skip over the '\0' */
2375 numberresult += (numprinted + 1);
2376 assert(*(numberresult - 1) == '\0');
2377 assert(*(numberresult - 2) != '\0');
2378 assert(numprinted >= 0);
2379 assert(numberresult <= numberresults + numbersize);
2380 break;
2381 case 'u':
2382 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2383 width, precision, 'u');
2384 if (longflag)
2385 numprinted = sprintf(numberresult, fmt,
2386 va_arg(count, unsigned long));
2387#ifdef HAVE_LONG_LONG
2388 else if (longlongflag)
2389 numprinted = sprintf(numberresult, fmt,
2390 va_arg(count, unsigned PY_LONG_LONG));
2391#endif
2392 else if (size_tflag)
2393 numprinted = sprintf(numberresult, fmt,
2394 va_arg(count, size_t));
2395 else
2396 numprinted = sprintf(numberresult, fmt,
2397 va_arg(count, unsigned int));
2398 n += numprinted;
2399 numberresult += (numprinted + 1);
2400 assert(*(numberresult - 1) == '\0');
2401 assert(*(numberresult - 2) != '\0');
2402 assert(numprinted >= 0);
2403 assert(numberresult <= numberresults + numbersize);
2404 break;
2405 case 'x':
2406 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2407 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2408 n += numprinted;
2409 numberresult += (numprinted + 1);
2410 assert(*(numberresult - 1) == '\0');
2411 assert(*(numberresult - 2) != '\0');
2412 assert(numprinted >= 0);
2413 assert(numberresult <= numberresults + numbersize);
2414 break;
2415 case 'p':
2416 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2417 /* %p is ill-defined: ensure leading 0x. */
2418 if (numberresult[1] == 'X')
2419 numberresult[1] = 'x';
2420 else if (numberresult[1] != 'x') {
2421 memmove(numberresult + 2, numberresult,
2422 strlen(numberresult) + 1);
2423 numberresult[0] = '0';
2424 numberresult[1] = 'x';
2425 numprinted += 2;
2426 }
2427 n += numprinted;
2428 numberresult += (numprinted + 1);
2429 assert(*(numberresult - 1) == '\0');
2430 assert(*(numberresult - 2) != '\0');
2431 assert(numprinted >= 0);
2432 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002433 break;
2434 case 's':
2435 {
2436 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002437 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002438 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2439 if (!str)
2440 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002441 /* since PyUnicode_DecodeUTF8 returns already flexible
2442 unicode objects, there is no need to call ready on them */
2443 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002444 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002445 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002446 /* Remember the str and switch to the next slot */
2447 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002448 break;
2449 }
2450 case 'U':
2451 {
2452 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002453 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002454 if (PyUnicode_READY(obj) == -1)
2455 goto fail;
2456 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002457 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002458 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002459 break;
2460 }
2461 case 'V':
2462 {
2463 PyObject *obj = va_arg(count, PyObject *);
2464 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002465 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002466 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002467 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002468 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002469 if (PyUnicode_READY(obj) == -1)
2470 goto fail;
2471 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002472 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002473 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002474 *callresult++ = NULL;
2475 }
2476 else {
2477 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2478 if (!str_obj)
2479 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002480 if (PyUnicode_READY(str_obj)) {
2481 Py_DECREF(str_obj);
2482 goto fail;
2483 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002484 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002485 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002486 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002487 *callresult++ = str_obj;
2488 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002489 break;
2490 }
2491 case 'S':
2492 {
2493 PyObject *obj = va_arg(count, PyObject *);
2494 PyObject *str;
2495 assert(obj);
2496 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002497 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002498 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002499 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002500 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002501 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002502 /* Remember the str and switch to the next slot */
2503 *callresult++ = str;
2504 break;
2505 }
2506 case 'R':
2507 {
2508 PyObject *obj = va_arg(count, PyObject *);
2509 PyObject *repr;
2510 assert(obj);
2511 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002512 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002513 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002514 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002515 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002516 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002517 /* Remember the repr and switch to the next slot */
2518 *callresult++ = repr;
2519 break;
2520 }
2521 case 'A':
2522 {
2523 PyObject *obj = va_arg(count, PyObject *);
2524 PyObject *ascii;
2525 assert(obj);
2526 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002527 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002528 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002529 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002530 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002531 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002532 /* Remember the repr and switch to the next slot */
2533 *callresult++ = ascii;
2534 break;
2535 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002536 default:
2537 /* if we stumble upon an unknown
2538 formatting code, copy the rest of
2539 the format string to the output
2540 string. (we cannot just skip the
2541 code, since there's no way to know
2542 what's in the argument list) */
2543 n += strlen(p);
2544 goto expand;
2545 }
2546 } else
2547 n++;
2548 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002549 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002550 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002551 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002552 we don't have to resize the string.
2553 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002554 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002555 if (!string)
2556 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002557 kind = PyUnicode_KIND(string);
2558 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002559 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002560 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002561
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002562 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002563 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002564 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002565
2566 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002567 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2568 /* checking for == because the last argument could be a empty
2569 string, which causes i to point to end, the assert at the end of
2570 the loop */
2571 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002572
Benjamin Peterson14339b62009-01-31 16:36:08 +00002573 switch (*f) {
2574 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002575 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002576 const int ordinal = va_arg(vargs, int);
2577 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002578 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002579 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002580 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002581 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002582 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002583 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002584 case 'p':
2585 /* unused, since we already have the result */
2586 if (*f == 'p')
2587 (void) va_arg(vargs, void *);
2588 else
2589 (void) va_arg(vargs, int);
2590 /* extract the result from numberresults and append. */
2591 for (; *numberresult; ++i, ++numberresult)
2592 PyUnicode_WRITE(kind, data, i, *numberresult);
2593 /* skip over the separating '\0' */
2594 assert(*numberresult == '\0');
2595 numberresult++;
2596 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002597 break;
2598 case 's':
2599 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002600 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002601 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002602 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002603 size = PyUnicode_GET_LENGTH(*callresult);
2604 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002605 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002606 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002607 /* We're done with the unicode()/repr() => forget it */
2608 Py_DECREF(*callresult);
2609 /* switch to next unicode()/repr() result */
2610 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002611 break;
2612 }
2613 case 'U':
2614 {
2615 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002616 Py_ssize_t size;
2617 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2618 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002619 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002620 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002621 break;
2622 }
2623 case 'V':
2624 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002625 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002626 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002627 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002628 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002629 size = PyUnicode_GET_LENGTH(obj);
2630 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002631 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002632 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002633 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002634 size = PyUnicode_GET_LENGTH(*callresult);
2635 assert(PyUnicode_KIND(*callresult) <=
2636 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002637 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002638 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002639 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002640 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002641 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002642 break;
2643 }
2644 case 'S':
2645 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002646 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002647 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002648 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002649 /* unused, since we already have the result */
2650 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002651 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002652 copy_characters(string, i, *callresult, 0, size);
2653 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002654 /* We're done with the unicode()/repr() => forget it */
2655 Py_DECREF(*callresult);
2656 /* switch to next unicode()/repr() result */
2657 ++callresult;
2658 break;
2659 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002660 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002661 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002662 break;
2663 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002664 for (; *p; ++p, ++i)
2665 PyUnicode_WRITE(kind, data, i, *p);
2666 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002667 goto end;
2668 }
Victor Stinner1205f272010-09-11 00:54:47 +00002669 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002670 else {
2671 assert(i < PyUnicode_GET_LENGTH(string));
2672 PyUnicode_WRITE(kind, data, i++, *f);
2673 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002674 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002675 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002676
Benjamin Peterson29060642009-01-31 22:14:21 +00002677 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002678 if (callresults)
2679 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002680 if (numberresults)
2681 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002682 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002683 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002684 if (callresults) {
2685 PyObject **callresult2 = callresults;
2686 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002687 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002688 ++callresult2;
2689 }
2690 PyObject_Free(callresults);
2691 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002692 if (numberresults)
2693 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002694 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002695}
2696
Walter Dörwaldd2034312007-05-18 16:29:38 +00002697PyObject *
2698PyUnicode_FromFormat(const char *format, ...)
2699{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002700 PyObject* ret;
2701 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002702
2703#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002704 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002705#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002706 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002707#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002708 ret = PyUnicode_FromFormatV(format, vargs);
2709 va_end(vargs);
2710 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002711}
2712
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002713#ifdef HAVE_WCHAR_H
2714
Victor Stinner5593d8a2010-10-02 11:11:27 +00002715/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2716 convert a Unicode object to a wide character string.
2717
Victor Stinnerd88d9832011-09-06 02:00:05 +02002718 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002719 character) required to convert the unicode object. Ignore size argument.
2720
Victor Stinnerd88d9832011-09-06 02:00:05 +02002721 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002722 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002723 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002724static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002725unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002726 wchar_t *w,
2727 Py_ssize_t size)
2728{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002729 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002730 const wchar_t *wstr;
2731
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002732 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002733 if (wstr == NULL)
2734 return -1;
2735
Victor Stinner5593d8a2010-10-02 11:11:27 +00002736 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002737 if (size > res)
2738 size = res + 1;
2739 else
2740 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002741 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002742 return res;
2743 }
2744 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002745 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002746}
2747
2748Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002749PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002750 wchar_t *w,
2751 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002752{
2753 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002754 PyErr_BadInternalCall();
2755 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002756 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002757 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002758}
2759
Victor Stinner137c34c2010-09-29 10:25:54 +00002760wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002761PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002762 Py_ssize_t *size)
2763{
2764 wchar_t* buffer;
2765 Py_ssize_t buflen;
2766
2767 if (unicode == NULL) {
2768 PyErr_BadInternalCall();
2769 return NULL;
2770 }
2771
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002772 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002773 if (buflen == -1)
2774 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002775 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002776 PyErr_NoMemory();
2777 return NULL;
2778 }
2779
Victor Stinner137c34c2010-09-29 10:25:54 +00002780 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2781 if (buffer == NULL) {
2782 PyErr_NoMemory();
2783 return NULL;
2784 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002785 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002786 if (buflen == -1)
2787 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002788 if (size != NULL)
2789 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002790 return buffer;
2791}
2792
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002793#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002794
Alexander Belopolsky40018472011-02-26 01:02:56 +00002795PyObject *
2796PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002797{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002798 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002799 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002800 PyErr_SetString(PyExc_ValueError,
2801 "chr() arg not in range(0x110000)");
2802 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002803 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002804
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002805 if (ordinal < 256)
2806 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002807
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002808 v = PyUnicode_New(1, ordinal);
2809 if (v == NULL)
2810 return NULL;
2811 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002812 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002813 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002814}
2815
Alexander Belopolsky40018472011-02-26 01:02:56 +00002816PyObject *
2817PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002818{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002819 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002820 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002821 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002822 if (PyUnicode_READY(obj))
2823 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002824 Py_INCREF(obj);
2825 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002826 }
2827 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002828 /* For a Unicode subtype that's not a Unicode object,
2829 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002830 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002831 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002832 PyErr_Format(PyExc_TypeError,
2833 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002834 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002835 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002836}
2837
Alexander Belopolsky40018472011-02-26 01:02:56 +00002838PyObject *
2839PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002840 const char *encoding,
2841 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002842{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002843 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002844 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002845
Guido van Rossumd57fd912000-03-10 22:53:23 +00002846 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002847 PyErr_BadInternalCall();
2848 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002849 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002850
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002851 /* Decoding bytes objects is the most common case and should be fast */
2852 if (PyBytes_Check(obj)) {
2853 if (PyBytes_GET_SIZE(obj) == 0) {
2854 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002855 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002856 }
2857 else {
2858 v = PyUnicode_Decode(
2859 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2860 encoding, errors);
2861 }
2862 return v;
2863 }
2864
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002865 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002866 PyErr_SetString(PyExc_TypeError,
2867 "decoding str is not supported");
2868 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002869 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002870
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002871 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2872 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2873 PyErr_Format(PyExc_TypeError,
2874 "coercing to str: need bytes, bytearray "
2875 "or buffer-like object, %.80s found",
2876 Py_TYPE(obj)->tp_name);
2877 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002878 }
Tim Petersced69f82003-09-16 20:30:58 +00002879
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002880 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002881 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002882 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002883 }
Tim Petersced69f82003-09-16 20:30:58 +00002884 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002885 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002886
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002887 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002888 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002889}
2890
/* Normalize an encoding name into the caller-supplied buffer `lower`:
   ASCII upper-case letters are lower-cased and '_' is replaced with '-',
   so that spellings such as "UTF_8" compare equal to "utf-8".  A NULL
   encoding is treated as "utf-8".

   lower_len is the total capacity of `lower` in bytes, including the room
   needed for the terminating NUL.

   Return 1 on success.  Return 0 if the normalized name plus its NUL does
   not fit into lower_len bytes; `lower` may then hold a partial,
   unterminated copy and must not be used. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    static const char default_name[] = "utf-8";
    const char *e;
    char *l;
    char *l_end;

    /* Without room for at least the NUL nothing can be stored; this also
       keeps the computation of l_end below from wrapping around. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* The default name is subject to the same size limit as any
           other name: never write past the end of a short buffer. */
        if (lower_len < sizeof(default_name))
            return 0;
        memcpy(lower, default_name, sizeof(default_name));
        return 1;
    }

    e = encoding;
    l = lower;
    l_end = &lower[lower_len - 1];      /* last slot is kept for the NUL */
    while (*e) {
        char c = *e++;

        if (l == l_end)
            return 0;
        /* ASCII-only, locale-independent case folding */
        if (c >= 'A' && c <= 'Z')
            c = (char)(c - 'A' + 'a');
        else if (c == '_')
            c = '-';
        *l++ = c;
    }
    *l = '\0';
    return 1;
}
2927
Alexander Belopolsky40018472011-02-26 01:02:56 +00002928PyObject *
2929PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002930 Py_ssize_t size,
2931 const char *encoding,
2932 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002933{
2934 PyObject *buffer = NULL, *unicode;
2935 Py_buffer info;
2936 char lower[11]; /* Enough for any encoding shortcut */
2937
Fred Drakee4315f52000-05-09 19:53:39 +00002938 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002939 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002940 if ((strcmp(lower, "utf-8") == 0) ||
2941 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002942 return PyUnicode_DecodeUTF8(s, size, errors);
2943 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002944 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002945 (strcmp(lower, "iso-8859-1") == 0))
2946 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002947#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002948 else if (strcmp(lower, "mbcs") == 0)
2949 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002950#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002951 else if (strcmp(lower, "ascii") == 0)
2952 return PyUnicode_DecodeASCII(s, size, errors);
2953 else if (strcmp(lower, "utf-16") == 0)
2954 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2955 else if (strcmp(lower, "utf-32") == 0)
2956 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2957 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002958
2959 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002960 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002961 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002962 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002963 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002964 if (buffer == NULL)
2965 goto onError;
2966 unicode = PyCodec_Decode(buffer, encoding, errors);
2967 if (unicode == NULL)
2968 goto onError;
2969 if (!PyUnicode_Check(unicode)) {
2970 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002971 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002972 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002973 Py_DECREF(unicode);
2974 goto onError;
2975 }
2976 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002977 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00002978
Benjamin Peterson29060642009-01-31 22:14:21 +00002979 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002980 Py_XDECREF(buffer);
2981 return NULL;
2982}
2983
Alexander Belopolsky40018472011-02-26 01:02:56 +00002984PyObject *
2985PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002986 const char *encoding,
2987 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002988{
2989 PyObject *v;
2990
2991 if (!PyUnicode_Check(unicode)) {
2992 PyErr_BadArgument();
2993 goto onError;
2994 }
2995
2996 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002997 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002998
2999 /* Decode via the codec registry */
3000 v = PyCodec_Decode(unicode, encoding, errors);
3001 if (v == NULL)
3002 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003003 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003004
Benjamin Peterson29060642009-01-31 22:14:21 +00003005 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003006 return NULL;
3007}
3008
Alexander Belopolsky40018472011-02-26 01:02:56 +00003009PyObject *
3010PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003011 const char *encoding,
3012 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003013{
3014 PyObject *v;
3015
3016 if (!PyUnicode_Check(unicode)) {
3017 PyErr_BadArgument();
3018 goto onError;
3019 }
3020
3021 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003022 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003023
3024 /* Decode via the codec registry */
3025 v = PyCodec_Decode(unicode, encoding, errors);
3026 if (v == NULL)
3027 goto onError;
3028 if (!PyUnicode_Check(v)) {
3029 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003030 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003031 Py_TYPE(v)->tp_name);
3032 Py_DECREF(v);
3033 goto onError;
3034 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003035 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003036
Benjamin Peterson29060642009-01-31 22:14:21 +00003037 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003038 return NULL;
3039}
3040
Alexander Belopolsky40018472011-02-26 01:02:56 +00003041PyObject *
3042PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003043 Py_ssize_t size,
3044 const char *encoding,
3045 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003046{
3047 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003048
Guido van Rossumd57fd912000-03-10 22:53:23 +00003049 unicode = PyUnicode_FromUnicode(s, size);
3050 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003051 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003052 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3053 Py_DECREF(unicode);
3054 return v;
3055}
3056
Alexander Belopolsky40018472011-02-26 01:02:56 +00003057PyObject *
3058PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003059 const char *encoding,
3060 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003061{
3062 PyObject *v;
3063
3064 if (!PyUnicode_Check(unicode)) {
3065 PyErr_BadArgument();
3066 goto onError;
3067 }
3068
3069 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003070 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003071
3072 /* Encode via the codec registry */
3073 v = PyCodec_Encode(unicode, encoding, errors);
3074 if (v == NULL)
3075 goto onError;
3076 return v;
3077
Benjamin Peterson29060642009-01-31 22:14:21 +00003078 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003079 return NULL;
3080}
3081
Victor Stinnerad158722010-10-27 00:25:46 +00003082PyObject *
3083PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003084{
Victor Stinner99b95382011-07-04 14:23:54 +02003085#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003086 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003087#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003088 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003089#else
Victor Stinner793b5312011-04-27 00:24:21 +02003090 PyInterpreterState *interp = PyThreadState_GET()->interp;
3091 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3092 cannot use it to encode and decode filenames before it is loaded. Load
3093 the Python codec requires to encode at least its own filename. Use the C
3094 version of the locale codec until the codec registry is initialized and
3095 the Python codec is loaded.
3096
3097 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3098 cannot only rely on it: check also interp->fscodec_initialized for
3099 subinterpreters. */
3100 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003101 return PyUnicode_AsEncodedString(unicode,
3102 Py_FileSystemDefaultEncoding,
3103 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003104 }
3105 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003106 /* locale encoding with surrogateescape */
3107 wchar_t *wchar;
3108 char *bytes;
3109 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00003110 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003111
3112 wchar = PyUnicode_AsWideCharString(unicode, NULL);
3113 if (wchar == NULL)
3114 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003115 bytes = _Py_wchar2char(wchar, &error_pos);
3116 if (bytes == NULL) {
3117 if (error_pos != (size_t)-1) {
3118 char *errmsg = strerror(errno);
3119 PyObject *exc = NULL;
3120 if (errmsg == NULL)
3121 errmsg = "Py_wchar2char() failed";
3122 raise_encode_exception(&exc,
Martin v. Löwis12be46c2011-11-04 19:04:15 +01003123 "filesystemencoding", unicode,
Victor Stinner2f02a512010-11-08 22:43:46 +00003124 error_pos, error_pos+1,
3125 errmsg);
3126 Py_XDECREF(exc);
3127 }
3128 else
3129 PyErr_NoMemory();
3130 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003131 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003132 }
3133 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003134
3135 bytes_obj = PyBytes_FromString(bytes);
3136 PyMem_Free(bytes);
3137 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00003138 }
Victor Stinnerad158722010-10-27 00:25:46 +00003139#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003140}
3141
Alexander Belopolsky40018472011-02-26 01:02:56 +00003142PyObject *
3143PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003144 const char *encoding,
3145 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003146{
3147 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003148 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003149
Guido van Rossumd57fd912000-03-10 22:53:23 +00003150 if (!PyUnicode_Check(unicode)) {
3151 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003152 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003153 }
Fred Drakee4315f52000-05-09 19:53:39 +00003154
Fred Drakee4315f52000-05-09 19:53:39 +00003155 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003156 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003157 if ((strcmp(lower, "utf-8") == 0) ||
3158 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003159 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003160 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003161 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003162 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003163 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003164 }
Victor Stinner37296e82010-06-10 13:36:23 +00003165 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003166 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003167 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003168 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003169#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003170 else if (strcmp(lower, "mbcs") == 0)
3171 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003172#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003173 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003174 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003175 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003176
3177 /* Encode via the codec registry */
3178 v = PyCodec_Encode(unicode, encoding, errors);
3179 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003180 return NULL;
3181
3182 /* The normal path */
3183 if (PyBytes_Check(v))
3184 return v;
3185
3186 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003187 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003188 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003189 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003190
3191 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3192 "encoder %s returned bytearray instead of bytes",
3193 encoding);
3194 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003195 Py_DECREF(v);
3196 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003197 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003198
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003199 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3200 Py_DECREF(v);
3201 return b;
3202 }
3203
3204 PyErr_Format(PyExc_TypeError,
3205 "encoder did not return a bytes object (type=%.400s)",
3206 Py_TYPE(v)->tp_name);
3207 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003208 return NULL;
3209}
3210
Alexander Belopolsky40018472011-02-26 01:02:56 +00003211PyObject *
3212PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003213 const char *encoding,
3214 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003215{
3216 PyObject *v;
3217
3218 if (!PyUnicode_Check(unicode)) {
3219 PyErr_BadArgument();
3220 goto onError;
3221 }
3222
3223 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003224 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003225
3226 /* Encode via the codec registry */
3227 v = PyCodec_Encode(unicode, encoding, errors);
3228 if (v == NULL)
3229 goto onError;
3230 if (!PyUnicode_Check(v)) {
3231 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003232 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003233 Py_TYPE(v)->tp_name);
3234 Py_DECREF(v);
3235 goto onError;
3236 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003237 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003238
Benjamin Peterson29060642009-01-31 22:14:21 +00003239 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003240 return NULL;
3241}
3242
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003243PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003244PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003245 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003246 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3247}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003248
Christian Heimes5894ba72007-11-04 11:43:14 +00003249PyObject*
3250PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3251{
Victor Stinner99b95382011-07-04 14:23:54 +02003252#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003253 return PyUnicode_DecodeMBCS(s, size, NULL);
3254#elif defined(__APPLE__)
3255 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
3256#else
Victor Stinner793b5312011-04-27 00:24:21 +02003257 PyInterpreterState *interp = PyThreadState_GET()->interp;
3258 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3259 cannot use it to encode and decode filenames before it is loaded. Load
3260 the Python codec requires to encode at least its own filename. Use the C
3261 version of the locale codec until the codec registry is initialized and
3262 the Python codec is loaded.
3263
3264 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3265 cannot only rely on it: check also interp->fscodec_initialized for
3266 subinterpreters. */
3267 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003268 return PyUnicode_Decode(s, size,
3269 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003270 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003271 }
3272 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003273 /* locale encoding with surrogateescape */
3274 wchar_t *wchar;
3275 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00003276 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003277
3278 if (s[size] != '\0' || size != strlen(s)) {
3279 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3280 return NULL;
3281 }
3282
Victor Stinner168e1172010-10-16 23:16:16 +00003283 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003284 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003285 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003286
Victor Stinner168e1172010-10-16 23:16:16 +00003287 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003288 PyMem_Free(wchar);
3289 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003290 }
Victor Stinnerad158722010-10-27 00:25:46 +00003291#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003292}
3293
Martin v. Löwis011e8422009-05-05 04:43:17 +00003294
3295int
3296PyUnicode_FSConverter(PyObject* arg, void* addr)
3297{
3298 PyObject *output = NULL;
3299 Py_ssize_t size;
3300 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003301 if (arg == NULL) {
3302 Py_DECREF(*(PyObject**)addr);
3303 return 1;
3304 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003305 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003306 output = arg;
3307 Py_INCREF(output);
3308 }
3309 else {
3310 arg = PyUnicode_FromObject(arg);
3311 if (!arg)
3312 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003313 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003314 Py_DECREF(arg);
3315 if (!output)
3316 return 0;
3317 if (!PyBytes_Check(output)) {
3318 Py_DECREF(output);
3319 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3320 return 0;
3321 }
3322 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003323 size = PyBytes_GET_SIZE(output);
3324 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003325 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003326 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003327 Py_DECREF(output);
3328 return 0;
3329 }
3330 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003331 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003332}
3333
3334
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003335int
3336PyUnicode_FSDecoder(PyObject* arg, void* addr)
3337{
3338 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003339 if (arg == NULL) {
3340 Py_DECREF(*(PyObject**)addr);
3341 return 1;
3342 }
3343 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003344 if (PyUnicode_READY(arg))
3345 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003346 output = arg;
3347 Py_INCREF(output);
3348 }
3349 else {
3350 arg = PyBytes_FromObject(arg);
3351 if (!arg)
3352 return 0;
3353 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3354 PyBytes_GET_SIZE(arg));
3355 Py_DECREF(arg);
3356 if (!output)
3357 return 0;
3358 if (!PyUnicode_Check(output)) {
3359 Py_DECREF(output);
3360 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3361 return 0;
3362 }
3363 }
Victor Stinner065836e2011-10-27 01:56:33 +02003364 if (PyUnicode_READY(output) < 0) {
3365 Py_DECREF(output);
3366 return 0;
3367 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003368 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003369 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003370 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3371 Py_DECREF(output);
3372 return 0;
3373 }
3374 *(PyObject**)addr = output;
3375 return Py_CLEANUP_SUPPORTED;
3376}
3377
3378
Martin v. Löwis5b222132007-06-10 09:51:05 +00003379char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003380PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003381{
Christian Heimesf3863112007-11-22 07:46:41 +00003382 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003383
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003384 if (!PyUnicode_Check(unicode)) {
3385 PyErr_BadArgument();
3386 return NULL;
3387 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003388 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003389 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003390
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003391 if (PyUnicode_UTF8(unicode) == NULL) {
3392 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003393 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3394 if (bytes == NULL)
3395 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003396 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3397 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003398 Py_DECREF(bytes);
3399 return NULL;
3400 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003401 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3402 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3403 PyBytes_AS_STRING(bytes),
3404 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003405 Py_DECREF(bytes);
3406 }
3407
3408 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003409 *psize = PyUnicode_UTF8_LENGTH(unicode);
3410 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003411}
3412
3413char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003414PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003415{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003416 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3417}
3418
3419#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003420static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003421#endif
3422
3423
3424Py_UNICODE *
3425PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3426{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003427 const unsigned char *one_byte;
3428#if SIZEOF_WCHAR_T == 4
3429 const Py_UCS2 *two_bytes;
3430#else
3431 const Py_UCS4 *four_bytes;
3432 const Py_UCS4 *ucs4_end;
3433 Py_ssize_t num_surrogates;
3434#endif
3435 wchar_t *w;
3436 wchar_t *wchar_end;
3437
3438 if (!PyUnicode_Check(unicode)) {
3439 PyErr_BadArgument();
3440 return NULL;
3441 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003442 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003443 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003444 assert(_PyUnicode_KIND(unicode) != 0);
3445 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003446
3447#ifdef Py_DEBUG
3448 ++unicode_as_unicode_calls;
3449#endif
3450
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003451 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003452#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003453 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3454 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003455 num_surrogates = 0;
3456
3457 for (; four_bytes < ucs4_end; ++four_bytes) {
3458 if (*four_bytes > 0xFFFF)
3459 ++num_surrogates;
3460 }
3461
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003462 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3463 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3464 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003465 PyErr_NoMemory();
3466 return NULL;
3467 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003468 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003469
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003470 w = _PyUnicode_WSTR(unicode);
3471 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3472 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003473 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3474 if (*four_bytes > 0xFFFF) {
Victor Stinner0d3721d2011-11-22 03:27:53 +01003475 assert(*four_bytes <= 0x10FFFF);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003476 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003477 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3478 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003479 }
3480 else
3481 *w = *four_bytes;
3482
3483 if (w > wchar_end) {
3484 assert(0 && "Miscalculated string end");
3485 }
3486 }
3487 *w = 0;
3488#else
3489 /* sizeof(wchar_t) == 4 */
3490 Py_FatalError("Impossible unicode object state, wstr and str "
3491 "should share memory already.");
3492 return NULL;
3493#endif
3494 }
3495 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003496 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3497 (_PyUnicode_LENGTH(unicode) + 1));
3498 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003499 PyErr_NoMemory();
3500 return NULL;
3501 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003502 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3503 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3504 w = _PyUnicode_WSTR(unicode);
3505 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003506
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003507 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3508 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003509 for (; w < wchar_end; ++one_byte, ++w)
3510 *w = *one_byte;
3511 /* null-terminate the wstr */
3512 *w = 0;
3513 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003514 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003515#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003516 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003517 for (; w < wchar_end; ++two_bytes, ++w)
3518 *w = *two_bytes;
3519 /* null-terminate the wstr */
3520 *w = 0;
3521#else
3522 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003523 PyObject_FREE(_PyUnicode_WSTR(unicode));
3524 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003525 Py_FatalError("Impossible unicode object state, wstr "
3526 "and str should share memory already.");
3527 return NULL;
3528#endif
3529 }
3530 else {
3531 assert(0 && "This should never happen.");
3532 }
3533 }
3534 }
3535 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003536 *size = PyUnicode_WSTR_LENGTH(unicode);
3537 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003538}
3539
Alexander Belopolsky40018472011-02-26 01:02:56 +00003540Py_UNICODE *
3541PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003542{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003543 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003544}
3545
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003546
Alexander Belopolsky40018472011-02-26 01:02:56 +00003547Py_ssize_t
3548PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003549{
3550 if (!PyUnicode_Check(unicode)) {
3551 PyErr_BadArgument();
3552 goto onError;
3553 }
3554 return PyUnicode_GET_SIZE(unicode);
3555
Benjamin Peterson29060642009-01-31 22:14:21 +00003556 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003557 return -1;
3558}
3559
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003560Py_ssize_t
3561PyUnicode_GetLength(PyObject *unicode)
3562{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003563 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003564 PyErr_BadArgument();
3565 return -1;
3566 }
3567
3568 return PyUnicode_GET_LENGTH(unicode);
3569}
3570
3571Py_UCS4
3572PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3573{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003574 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3575 PyErr_BadArgument();
3576 return (Py_UCS4)-1;
3577 }
3578 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3579 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003580 return (Py_UCS4)-1;
3581 }
3582 return PyUnicode_READ_CHAR(unicode, index);
3583}
3584
3585int
3586PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3587{
3588 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003589 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003590 return -1;
3591 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003592 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3593 PyErr_SetString(PyExc_IndexError, "string index out of range");
3594 return -1;
3595 }
3596 if (_PyUnicode_Dirty(unicode))
3597 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003598 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3599 index, ch);
3600 return 0;
3601}
3602
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3608
Victor Stinner554f3f02010-06-16 23:33:54 +00003609/* create or adjust a UnicodeDecodeError */
3610static void
3611make_decode_exception(PyObject **exceptionObject,
3612 const char *encoding,
3613 const char *input, Py_ssize_t length,
3614 Py_ssize_t startpos, Py_ssize_t endpos,
3615 const char *reason)
3616{
3617 if (*exceptionObject == NULL) {
3618 *exceptionObject = PyUnicodeDecodeError_Create(
3619 encoding, input, length, startpos, endpos, reason);
3620 }
3621 else {
3622 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3623 goto onError;
3624 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3625 goto onError;
3626 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3627 goto onError;
3628 }
3629 return;
3630
3631onError:
3632 Py_DECREF(*exceptionObject);
3633 *exceptionObject = NULL;
3634}
3635
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003636/* error handling callback helper:
3637 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003638 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003639 and adjust various state variables.
3640 return 0 on success, -1 on error
3641*/
3642
Alexander Belopolsky40018472011-02-26 01:02:56 +00003643static int
3644unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003645 const char *encoding, const char *reason,
3646 const char **input, const char **inend, Py_ssize_t *startinpos,
3647 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003648 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003649{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003650 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003651
3652 PyObject *restuple = NULL;
3653 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01003654 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003655 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003656 Py_ssize_t requiredsize;
3657 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003658 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003659 int res = -1;
3660
Victor Stinner596a6c42011-11-09 00:02:18 +01003661 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
3662 outsize = PyUnicode_GET_LENGTH(*output);
3663 else
3664 outsize = _PyUnicode_WSTR_LENGTH(*output);
3665
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003666 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003667 *errorHandler = PyCodec_LookupError(errors);
3668 if (*errorHandler == NULL)
3669 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003670 }
3671
Victor Stinner554f3f02010-06-16 23:33:54 +00003672 make_decode_exception(exceptionObject,
3673 encoding,
3674 *input, *inend - *input,
3675 *startinpos, *endinpos,
3676 reason);
3677 if (*exceptionObject == NULL)
3678 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003679
3680 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3681 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003682 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003683 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003684 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003685 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003686 }
3687 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003688 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003689 if (PyUnicode_READY(repunicode) < 0)
3690 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003691
3692 /* Copy back the bytes variables, which might have been modified by the
3693 callback */
3694 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3695 if (!inputobj)
3696 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003697 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003698 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003699 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003700 *input = PyBytes_AS_STRING(inputobj);
3701 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003702 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003703 /* we can DECREF safely, as the exception has another reference,
3704 so the object won't go away. */
3705 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003706
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003707 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003708 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003709 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003710 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3711 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003712 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003713
Victor Stinner596a6c42011-11-09 00:02:18 +01003714 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
3715 /* need more space? (at least enough for what we
3716 have+the replacement+the rest of the string (starting
3717 at the new input position), so we won't have to check space
3718 when there are no errors in the rest of the string) */
3719 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
3720 requiredsize = *outpos + replen + insize-newpos;
3721 if (requiredsize > outsize) {
3722 if (requiredsize<2*outsize)
3723 requiredsize = 2*outsize;
3724 if (unicode_resize(output, requiredsize) < 0)
3725 goto onError;
3726 }
3727 if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003728 goto onError;
Victor Stinner596a6c42011-11-09 00:02:18 +01003729 copy_characters(*output, *outpos, repunicode, 0, replen);
3730 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003731 }
Victor Stinner596a6c42011-11-09 00:02:18 +01003732 else {
3733 wchar_t *repwstr;
3734 Py_ssize_t repwlen;
3735 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
3736 if (repwstr == NULL)
3737 goto onError;
3738 /* need more space? (at least enough for what we
3739 have+the replacement+the rest of the string (starting
3740 at the new input position), so we won't have to check space
3741 when there are no errors in the rest of the string) */
3742 requiredsize = *outpos + repwlen + insize-newpos;
3743 if (requiredsize > outsize) {
3744 if (requiredsize < 2*outsize)
3745 requiredsize = 2*outsize;
3746 if (unicode_resize(output, requiredsize) < 0)
3747 goto onError;
3748 }
3749 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
3750 *outpos += repwlen;
3751 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003752 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003753 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003754
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003755 /* we made it! */
3756 res = 0;
3757
Benjamin Peterson29060642009-01-31 22:14:21 +00003758 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003759 Py_XDECREF(restuple);
3760 return res;
3761}
3762
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003763/* --- UTF-7 Codec -------------------------------------------------------- */
3764
/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Helper macros for the base-64 alphabet used inside UTF-7 shift
   sequences.  Note that each macro may evaluate its argument more than
   once, so arguments must be free of side effects. */

/* Non-zero if c is one of the 64 base-64 characters (A-Z, a-z, 0-9, +, /). */

#define IS_BASE64(c)                        \
    (((c) >= 'A' && (c) <= 'Z') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= '0' && (c) <= '9') ||          \
     (c) == '+' || (c) == '/')

/* The 6-bit value (0..63) of base-64 character c.  Only meaningful when
   IS_BASE64(c) holds; any other input yields 63. */

#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* The base-64 character encoding the bottom 6 bits of n. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: non-zero if byte c, met outside a shift sequence in a
 * UTF-7 string, decodes as itself.  Decoding is permissive: every ASCII
 * byte decodes to itself except '+', which starts a base-64 section. */

#define DECODE_DIRECT(c)                \
    ((c) <= 127 && (c) != '+')
3797
/* Classification of the 128 ASCII characters for the UTF-7 encoder, which
 * treats them differently depending on whether they belong to Set D,
 * Set O, whitespace, or none of these (see RFC2152).  The table value
 * for each character is:
 *   0 : "Set D"
 *       alphanumeric and '(),-./:?
 *   1 : "Set O"
 *       !"#$%&*;<=>@[]^_`{|}
 *   2 : "whitespace"
 *       ht nl cr sp
 *   3 : special (must be base64 encoded)
 *       everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
    /* 0x00: nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    /* 0x10: dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x20: sp  !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0,
    /* 0x30: 0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    /* 0x40: @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50: P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    /* 0x60: `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70: p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3,
};

/* ENCODE_DIRECT: non-zero if character c should be emitted as itself by
 * the encoder.  Set D characters always are; Set O characters only when
 * directO is true, and whitespace only when directWS is true.  NUL and
 * anything >= 128 are never direct.  RFC2152 leaves the Set O and
 * whitespace policy to the application, hence the two flags. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003843
Alexander Belopolsky40018472011-02-26 01:02:56 +00003844PyObject *
3845PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003846 Py_ssize_t size,
3847 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003848{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003849 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3850}
3851
Antoine Pitrou244651a2009-05-04 18:56:13 +00003852/* The decoder. The only state we preserve is our read position,
3853 * i.e. how many characters we have consumed. So if we end in the
3854 * middle of a shift sequence we have to back off the read position
3855 * and the output to the beginning of the sequence, otherwise we lose
3856 * all the shift state (seen bits, number of bits seen, high
3857 * surrogate). */
3858
Alexander Belopolsky40018472011-02-26 01:02:56 +00003859PyObject *
3860PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003861 Py_ssize_t size,
3862 const char *errors,
3863 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003864{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003865 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003866 Py_ssize_t startinpos;
3867 Py_ssize_t endinpos;
3868 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003869 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003870 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003871 const char *errmsg = "";
3872 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003873 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003874 unsigned int base64bits = 0;
3875 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01003876 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003877 PyObject *errorHandler = NULL;
3878 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003879
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003880 /* Start off assuming it's all ASCII. Widen later as necessary. */
3881 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003882 if (!unicode)
3883 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003884 if (size == 0) {
3885 if (consumed)
3886 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003887 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003888 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003889
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003890 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003891 e = s + size;
3892
3893 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003894 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003895 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003896 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003897
Antoine Pitrou244651a2009-05-04 18:56:13 +00003898 if (inShift) { /* in a base-64 section */
3899 if (IS_BASE64(ch)) { /* consume a base-64 character */
3900 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3901 base64bits += 6;
3902 s++;
3903 if (base64bits >= 16) {
3904 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01003905 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00003906 base64bits -= 16;
3907 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3908 if (surrogate) {
3909 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01003910 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
3911 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003912 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
3913 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003914 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003915 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003916 }
3917 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01003918 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
3919 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003920 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003921 }
3922 }
Victor Stinner551ac952011-11-29 22:58:13 +01003923 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003924 /* first surrogate */
3925 surrogate = outCh;
3926 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003927 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003928 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
3929 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003930 }
3931 }
3932 }
3933 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003934 inShift = 0;
3935 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003936 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01003937 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
3938 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003939 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003940 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003941 if (base64bits > 0) { /* left-over bits */
3942 if (base64bits >= 6) {
3943 /* We've seen at least one base-64 character */
3944 errmsg = "partial character in shift sequence";
3945 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003946 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003947 else {
3948 /* Some bits remain; they should be zero */
3949 if (base64buffer != 0) {
3950 errmsg = "non-zero padding bits in shift sequence";
3951 goto utf7Error;
3952 }
3953 }
3954 }
3955 if (ch != '-') {
3956 /* '-' is absorbed; other terminating
3957 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003958 if (unicode_putchar(&unicode, &outpos, ch) < 0)
3959 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003960 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003961 }
3962 }
3963 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003964 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003965 s++; /* consume '+' */
3966 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003967 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003968 if (unicode_putchar(&unicode, &outpos, '+') < 0)
3969 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003970 }
3971 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003972 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003973 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003974 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003975 }
3976 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003977 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003978 if (unicode_putchar(&unicode, &outpos, ch) < 0)
3979 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003980 s++;
3981 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003982 else {
3983 startinpos = s-starts;
3984 s++;
3985 errmsg = "unexpected special character";
3986 goto utf7Error;
3987 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003988 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003989utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003990 endinpos = s-starts;
3991 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003992 errors, &errorHandler,
3993 "utf7", errmsg,
3994 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003995 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003996 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003997 }
3998
Antoine Pitrou244651a2009-05-04 18:56:13 +00003999 /* end of string */
4000
4001 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4002 /* if we're in an inconsistent state, that's an error */
4003 if (surrogate ||
4004 (base64bits >= 6) ||
4005 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004006 endinpos = size;
4007 if (unicode_decode_call_errorhandler(
4008 errors, &errorHandler,
4009 "utf7", "unterminated shift sequence",
4010 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004011 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004012 goto onError;
4013 if (s < e)
4014 goto restart;
4015 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004016 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004017
4018 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004019 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004020 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004021 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004022 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004023 }
4024 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004025 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004026 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004027 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004028
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004029 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004030 goto onError;
4031
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004032 Py_XDECREF(errorHandler);
4033 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004034 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004035
Benjamin Peterson29060642009-01-31 22:14:21 +00004036 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004037 Py_XDECREF(errorHandler);
4038 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004039 Py_DECREF(unicode);
4040 return NULL;
4041}
4042
4043
Alexander Belopolsky40018472011-02-26 01:02:56 +00004044PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004045_PyUnicode_EncodeUTF7(PyObject *str,
4046 int base64SetO,
4047 int base64WhiteSpace,
4048 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004049{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004050 int kind;
4051 void *data;
4052 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004053 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004054 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004055 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004056 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004057 unsigned int base64bits = 0;
4058 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004059 char * out;
4060 char * start;
4061
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004062 if (PyUnicode_READY(str) < 0)
4063 return NULL;
4064 kind = PyUnicode_KIND(str);
4065 data = PyUnicode_DATA(str);
4066 len = PyUnicode_GET_LENGTH(str);
4067
4068 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004069 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004070
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004071 /* It might be possible to tighten this worst case */
4072 allocated = 8 * len;
4073 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004074 return PyErr_NoMemory();
4075
Antoine Pitrou244651a2009-05-04 18:56:13 +00004076 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004077 if (v == NULL)
4078 return NULL;
4079
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004080 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004081 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004082 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004083
Antoine Pitrou244651a2009-05-04 18:56:13 +00004084 if (inShift) {
4085 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4086 /* shifting out */
4087 if (base64bits) { /* output remaining bits */
4088 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4089 base64buffer = 0;
4090 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004091 }
4092 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004093 /* Characters not in the BASE64 set implicitly unshift the sequence
4094 so no '-' is required, except if the character is itself a '-' */
4095 if (IS_BASE64(ch) || ch == '-') {
4096 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004097 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004098 *out++ = (char) ch;
4099 }
4100 else {
4101 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004102 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004103 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004104 else { /* not in a shift sequence */
4105 if (ch == '+') {
4106 *out++ = '+';
4107 *out++ = '-';
4108 }
4109 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4110 *out++ = (char) ch;
4111 }
4112 else {
4113 *out++ = '+';
4114 inShift = 1;
4115 goto encode_char;
4116 }
4117 }
4118 continue;
4119encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004120 if (ch >= 0x10000) {
Victor Stinner0d3721d2011-11-22 03:27:53 +01004121 assert(ch <= 0x10FFFF);
4122
Antoine Pitrou244651a2009-05-04 18:56:13 +00004123 /* code first surrogate */
4124 base64bits += 16;
4125 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4126 while (base64bits >= 6) {
4127 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4128 base64bits -= 6;
4129 }
4130 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004131 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004132 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004133 base64bits += 16;
4134 base64buffer = (base64buffer << 16) | ch;
4135 while (base64bits >= 6) {
4136 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4137 base64bits -= 6;
4138 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004139 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004140 if (base64bits)
4141 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4142 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004143 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004144 if (_PyBytes_Resize(&v, out - start) < 0)
4145 return NULL;
4146 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004147}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004148PyObject *
4149PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4150 Py_ssize_t size,
4151 int base64SetO,
4152 int base64WhiteSpace,
4153 const char *errors)
4154{
4155 PyObject *result;
4156 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4157 if (tmp == NULL)
4158 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004159 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004160 base64WhiteSpace, errors);
4161 Py_DECREF(tmp);
4162 return result;
4163}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004164
Antoine Pitrou244651a2009-05-04 18:56:13 +00004165#undef IS_BASE64
4166#undef FROM_BASE64
4167#undef TO_BASE64
4168#undef DECODE_DIRECT
4169#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004170
Guido van Rossumd57fd912000-03-10 22:53:23 +00004171/* --- UTF-8 Codec -------------------------------------------------------- */
4172
/* Map a UTF-8 lead (prefix) byte to the length in bytes of the sequence
   it starts.  Zero marks a byte that cannot start a sequence:
   continuation bytes 80-BF, the overlong prefixes C0-C1, and F5-FF.
   See RFC 3629 for details. */
static
char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4194
Alexander Belopolsky40018472011-02-26 01:02:56 +00004195PyObject *
4196PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004197 Py_ssize_t size,
4198 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004199{
Walter Dörwald69652032004-09-07 20:24:22 +00004200 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4201}
4202
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004203#include "stringlib/ucs1lib.h"
4204#include "stringlib/codecs.h"
4205#include "stringlib/undef.h"
4206
4207#include "stringlib/ucs2lib.h"
4208#include "stringlib/codecs.h"
4209#include "stringlib/undef.h"
4210
4211#include "stringlib/ucs4lib.h"
4212#include "stringlib/codecs.h"
4213#include "stringlib/undef.h"
4214
Antoine Pitrouab868312009-01-10 15:40:25 +00004215/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4216#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4217
4218/* Mask to quickly check whether a C 'long' contains a
4219 non-ASCII, UTF8-encoded char. */
4220#if (SIZEOF_LONG == 8)
4221# define ASCII_CHAR_MASK 0x8080808080808080L
4222#elif (SIZEOF_LONG == 4)
4223# define ASCII_CHAR_MASK 0x80808080L
4224#else
4225# error C 'long' size should be either 4 or 8!
4226#endif
4227
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004228/* Scans a UTF-8 string and returns the maximum character to be expected
4229 and the size of the decoded unicode string.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004230
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004231 This function doesn't check for errors, these checks are performed in
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004232 PyUnicode_DecodeUTF8Stateful.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004233 */
4234static Py_UCS4
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004235utf8_max_char_size_and_char_count(const char *s, Py_ssize_t string_size,
4236 Py_ssize_t *unicode_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004237{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004238 Py_ssize_t char_count = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004239 const unsigned char *p = (const unsigned char *)s;
4240 const unsigned char *end = p + string_size;
4241 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004242
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004243 assert(unicode_size != NULL);
4244
4245 /* By having a cascade of independent loops which fallback onto each
4246 other, we minimize the amount of work done in the average loop
4247 iteration, and we also maximize the CPU's ability to predict
4248 branches correctly (because a given condition will have always the
4249 same boolean outcome except perhaps in the last iteration of the
4250 corresponding loop).
4251 In the general case this brings us rather close to decoding
4252 performance pre-PEP 393, despite the two-pass decoding.
4253
4254 Note that the pure ASCII loop is not duplicated once a non-ASCII
4255 character has been encountered. It is actually a pessimization (by
4256 a significant factor) to use this loop on text with many non-ASCII
4257 characters, and it is important to avoid bad performance on valid
4258 utf-8 data (invalid utf-8 being a different can of worms).
4259 */
4260
4261 /* ASCII */
4262 for (; p < end; ++p) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004263 /* Only check value if it's not a ASCII char... */
4264 if (*p < 0x80) {
4265 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4266 an explanation. */
4267 if (!((size_t) p & LONG_PTR_MASK)) {
4268 /* Help register allocation */
4269 register const unsigned char *_p = p;
4270 while (_p < aligned_end) {
4271 unsigned long value = *(unsigned long *) _p;
4272 if (value & ASCII_CHAR_MASK)
4273 break;
4274 _p += SIZEOF_LONG;
4275 char_count += SIZEOF_LONG;
4276 }
4277 p = _p;
4278 if (p == end)
4279 break;
4280 }
4281 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004282 if (*p < 0x80)
4283 ++char_count;
4284 else
4285 goto _ucs1loop;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004286 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004287 *unicode_size = char_count;
4288 return 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004289
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004290_ucs1loop:
4291 for (; p < end; ++p) {
4292 if (*p < 0xc4)
4293 char_count += ((*p & 0xc0) != 0x80);
4294 else
4295 goto _ucs2loop;
4296 }
4297 *unicode_size = char_count;
4298 return 255;
4299
4300_ucs2loop:
4301 for (; p < end; ++p) {
4302 if (*p < 0xf0)
4303 char_count += ((*p & 0xc0) != 0x80);
4304 else
4305 goto _ucs4loop;
4306 }
4307 *unicode_size = char_count;
4308 return 65535;
4309
4310_ucs4loop:
4311 for (; p < end; ++p) {
4312 char_count += ((*p & 0xc0) != 0x80);
4313 }
4314 *unicode_size = char_count;
4315 return 65537;
4316}
4317
4318/* Called when we encountered some error that wasn't detected in the original
4319 scan, e.g. an encoded surrogate character. The original maxchar computation
4320 may have been incorrect, so redo it. */
4321static int
4322refit_partial_string(PyObject **unicode, int kind, void *data, Py_ssize_t n)
4323{
4324 PyObject *tmp;
Victor Stinnerf8facac2011-11-22 02:30:47 +01004325 Py_ssize_t k;
4326 Py_UCS4 maxchar;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004327 for (k = 0, maxchar = 0; k < n; k++)
4328 maxchar = Py_MAX(maxchar, PyUnicode_READ(kind, data, k));
4329 tmp = PyUnicode_New(PyUnicode_GET_LENGTH(*unicode), maxchar);
4330 if (tmp == NULL)
4331 return -1;
4332 PyUnicode_CopyCharacters(tmp, 0, *unicode, 0, n);
4333 Py_DECREF(*unicode);
4334 *unicode = tmp;
4335 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004336}
4337
/* Store `value` at position `index` of the string being decoded.

   Relies on names from the enclosing function: unicode, kind, data,
   has_errors and the label onError.

   While has_errors is false this is a plain PyUnicode_WRITE into
   kind/data.  Once has_errors is set, the string is first resized (with
   overallocation) if the position lies beyond its current length, and the
   character is stored with unicode_putchar(); a failure of either call
   jumps to onError.  Because of the overallocation the result has to be
   shrunk to its final length at the end.
*/
#define WRITE_MAYBE_FAIL(index, value)                                  \
    do {                                                                \
        if (!has_errors) {                                              \
            PyUnicode_WRITE(kind, data, index, value);                  \
        }                                                               \
        else {                                                          \
            Py_ssize_t pos = index;                                     \
            if (pos > PyUnicode_GET_LENGTH(unicode)) {                  \
                if (unicode_resize(&unicode, pos + pos/8) < 0)          \
                    goto onError;                                       \
            }                                                           \
            if (unicode_putchar(&unicode, &pos, value) < 0)             \
                goto onError;                                           \
        }                                                               \
    } while (0)
4356
Alexander Belopolsky40018472011-02-26 01:02:56 +00004357PyObject *
4358PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004359 Py_ssize_t size,
4360 const char *errors,
4361 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004362{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004363 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004364 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004365 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004366 Py_ssize_t startinpos;
4367 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00004368 const char *e, *aligned_end;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004369 PyObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004370 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004371 PyObject *errorHandler = NULL;
4372 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004373 Py_UCS4 maxchar = 0;
4374 Py_ssize_t unicode_size;
4375 Py_ssize_t i;
4376 int kind;
4377 void *data;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004378 int has_errors = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004379
Walter Dörwald69652032004-09-07 20:24:22 +00004380 if (size == 0) {
4381 if (consumed)
4382 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004383 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00004384 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004385 maxchar = utf8_max_char_size_and_char_count(s, size, &unicode_size);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004386 /* When the string is ASCII only, just use memcpy and return.
4387 unicode_size may be != size if there is an incomplete UTF-8
4388 sequence at the end of the ASCII block. */
4389 if (maxchar < 128 && size == unicode_size) {
Victor Stinner42885202011-11-22 01:23:02 +01004390 if (consumed)
4391 *consumed = size;
4392
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004393 if (size == 1)
4394 return get_latin1_char((unsigned char)s[0]);
4395
4396 unicode = PyUnicode_New(unicode_size, maxchar);
4397 if (!unicode)
4398 return NULL;
4399 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4400 assert(_PyUnicode_CheckConsistency(unicode, 1));
4401 return unicode;
4402 }
4403
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004404 /* In case of errors, maxchar and size computation might be incorrect;
4405 code below refits and resizes as necessary. */
4406 unicode = PyUnicode_New(unicode_size, maxchar);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004407 if (!unicode)
4408 return NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004409 kind = PyUnicode_KIND(unicode);
4410 data = PyUnicode_DATA(unicode);
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004411
Guido van Rossumd57fd912000-03-10 22:53:23 +00004412 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004413 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004414 e = s + size;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004415 switch (kind) {
4416 case PyUnicode_1BYTE_KIND:
4417 has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
4418 break;
4419 case PyUnicode_2BYTE_KIND:
4420 has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
4421 break;
4422 case PyUnicode_4BYTE_KIND:
4423 has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
4424 break;
4425 }
4426 if (!has_errors) {
4427 /* Ensure the unicode size calculation was correct */
4428 assert(i == unicode_size);
4429 assert(s == e);
4430 if (consumed)
4431 *consumed = s-starts;
4432 return unicode;
4433 }
4434 /* Fall through to the generic decoding loop for the rest of
4435 the string */
4436 if (refit_partial_string(&unicode, kind, data, i) < 0)
4437 goto onError;
4438
Antoine Pitrouab868312009-01-10 15:40:25 +00004439 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004440
4441 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004442 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004443
4444 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004445 /* Fast path for runs of ASCII characters. Given that common UTF-8
4446 input will consist of an overwhelming majority of ASCII
4447 characters, we try to optimize for this case by checking
4448 as many characters as a C 'long' can contain.
4449 First, check if we can do an aligned read, as most CPUs have
4450 a penalty for unaligned reads.
4451 */
4452 if (!((size_t) s & LONG_PTR_MASK)) {
4453 /* Help register allocation */
4454 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004455 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004456 while (_s < aligned_end) {
4457 /* Read a whole long at a time (either 4 or 8 bytes),
4458 and do a fast unrolled copy if it only contains ASCII
4459 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004460 unsigned long value = *(unsigned long *) _s;
4461 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004462 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004463 WRITE_MAYBE_FAIL(_i+0, _s[0]);
4464 WRITE_MAYBE_FAIL(_i+1, _s[1]);
4465 WRITE_MAYBE_FAIL(_i+2, _s[2]);
4466 WRITE_MAYBE_FAIL(_i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004467#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004468 WRITE_MAYBE_FAIL(_i+4, _s[4]);
4469 WRITE_MAYBE_FAIL(_i+5, _s[5]);
4470 WRITE_MAYBE_FAIL(_i+6, _s[6]);
4471 WRITE_MAYBE_FAIL(_i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004472#endif
4473 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004474 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004475 }
4476 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004477 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004478 if (s == e)
4479 break;
4480 ch = (unsigned char)*s;
4481 }
4482 }
4483
4484 if (ch < 0x80) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004485 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004486 s++;
4487 continue;
4488 }
4489
4490 n = utf8_code_length[ch];
4491
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004492 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004493 if (consumed)
4494 break;
4495 else {
4496 errmsg = "unexpected end of data";
4497 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004498 endinpos = startinpos+1;
4499 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4500 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004501 goto utf8Error;
4502 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004503 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004504
4505 switch (n) {
4506
4507 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004508 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004509 startinpos = s-starts;
4510 endinpos = startinpos+1;
4511 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004512
4513 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004514 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004515 startinpos = s-starts;
4516 endinpos = startinpos+1;
4517 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004518
4519 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004520 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004521 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004522 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004523 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004524 goto utf8Error;
4525 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004526 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004527 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004528 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004529 break;
4530
4531 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004532 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4533 will result in surrogates in range d800-dfff. Surrogates are
4534 not valid UTF-8 so they are rejected.
4535 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4536 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004537 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004538 (s[2] & 0xc0) != 0x80 ||
4539 ((unsigned char)s[0] == 0xE0 &&
4540 (unsigned char)s[1] < 0xA0) ||
4541 ((unsigned char)s[0] == 0xED &&
4542 (unsigned char)s[1] > 0x9F)) {
4543 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004544 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004545 endinpos = startinpos + 1;
4546
4547 /* if s[1] first two bits are 1 and 0, then the invalid
4548 continuation byte is s[2], so increment endinpos by 1,
4549 if not, s[1] is invalid and endinpos doesn't need to
4550 be incremented. */
4551 if ((s[1] & 0xC0) == 0x80)
4552 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004553 goto utf8Error;
4554 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004555 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004556 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004557 WRITE_MAYBE_FAIL(i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004558 break;
4559
4560 case 4:
4561 if ((s[1] & 0xc0) != 0x80 ||
4562 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004563 (s[3] & 0xc0) != 0x80 ||
4564 ((unsigned char)s[0] == 0xF0 &&
4565 (unsigned char)s[1] < 0x90) ||
4566 ((unsigned char)s[0] == 0xF4 &&
4567 (unsigned char)s[1] > 0x8F)) {
4568 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004569 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004570 endinpos = startinpos + 1;
4571 if ((s[1] & 0xC0) == 0x80) {
4572 endinpos++;
4573 if ((s[2] & 0xC0) == 0x80)
4574 endinpos++;
4575 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004576 goto utf8Error;
4577 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004578 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004579 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4580 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4581
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004582 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004583 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004584 }
4585 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004586 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004587
Benjamin Peterson29060642009-01-31 22:14:21 +00004588 utf8Error:
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004589 if (!has_errors) {
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004590 if (refit_partial_string(&unicode, kind, data, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004591 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004592 has_errors = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004593 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004594 if (unicode_decode_call_errorhandler(
4595 errors, &errorHandler,
4596 "utf8", errmsg,
4597 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004598 &unicode, &i))
Benjamin Peterson29060642009-01-31 22:14:21 +00004599 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004600 /* Update data because unicode_decode_call_errorhandler might have
4601 re-created or resized the unicode object. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004602 data = PyUnicode_DATA(unicode);
4603 kind = PyUnicode_KIND(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004604 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004605 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004606 /* Ensure the unicode_size calculation above was correct: */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004607 assert(has_errors || i == unicode_size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004608
Walter Dörwald69652032004-09-07 20:24:22 +00004609 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004610 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004611
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004612 /* Adjust length and ready string when it contained errors and
4613 is of the old resizable kind. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004614 if (has_errors) {
Victor Stinner7931d9a2011-11-04 00:22:48 +01004615 if (PyUnicode_Resize(&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004616 goto onError;
4617 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004618
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004619 Py_XDECREF(errorHandler);
4620 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004621 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004622 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004623
Benjamin Peterson29060642009-01-31 22:14:21 +00004624 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004625 Py_XDECREF(errorHandler);
4626 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004627 Py_DECREF(unicode);
4628 return NULL;
4629}
4630
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004631#undef WRITE_MAYBE_FAIL
Antoine Pitrouab868312009-01-10 15:40:25 +00004632
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decodes the `size` bytes at `s` into a freshly allocated, NUL-terminated
   wchar_t array obtained from PyMem_Malloc(); the caller owns the result.
   Every byte that is not part of a well-formed sequence is stored as
   0xDC00 + byte and decoding resumes at the following byte.

   Returns NULL if the buffer size would overflow (an exception is set by
   PyErr_NoMemory() in that case) or if the allocation fails (no exception
   is set by this function in that case). */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    int n;
    const char *e;
    wchar_t *unicode, *p;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    /* Overflow check for (size + 1) * sizeof(wchar_t), written without
       computing size + 1 so that the check itself cannot overflow. */
    if ((size_t)size > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
        PyErr_NoMemory();
        return NULL;
    }
    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    p = unicode;
    e = s + size;
    while (s < e) {
        /* ch holds the lead byte until a sequence has been validated; the
           surrogateescape label below relies on that. */
        Py_UCS4 ch = (unsigned char)*s;

        if (ch < 0x80) {
            *p++ = (wchar_t)ch;
            s++;
            continue;
        }

        n = utf8_code_length[ch];
        if (e - s < n) {
            /* sequence cut off by the end of the input */
            goto surrogateescape;
        }

        switch (n) {
        case 0:
        case 1:
            goto surrogateescape;

        case 2:
            if ((s[1] & 0xc0) != 0x80)
                goto surrogateescape;
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *p++ = (wchar_t)ch;
            break;

        case 3:
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xE0 &&
                 (unsigned char)s[1] < 0xA0) ||
                ((unsigned char)s[0] == 0xED &&
                 (unsigned char)s[1] > 0x9F)) {

                goto surrogateescape;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            *p++ = (wchar_t)ch;
            break;

        case 4:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xF0 &&
                 (unsigned char)s[1] < 0x90) ||
                ((unsigned char)s[0] == 0xF4 &&
                 (unsigned char)s[1] > 0x8F)) {
                goto surrogateescape;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#if SIZEOF_WCHAR_T == 4
            *p++ = (wchar_t)ch;
#else
            /*  compute and append the two surrogates: */
            *p++ = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            *p++ = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
            break;
        }
        s += n;
        continue;

      surrogateescape:
        /* escape the offending byte and restart right after it */
        *p++ = 0xDC00 + ch;
        s++;
    }
    *p = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004740
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004741/* Primary internal function which creates utf8 encoded bytes objects.
4742
4743 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004744 and allocate exactly as much space needed at the end. Else allocate the
4745 maximum possible needed (4 result bytes per Unicode character), and return
4746 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004747*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004748PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004749_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004750{
Tim Peters602f7402002-04-27 18:03:26 +00004751#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004752
Guido van Rossum98297ee2007-11-06 21:34:58 +00004753 Py_ssize_t i; /* index into s of next input byte */
4754 PyObject *result; /* result string object */
4755 char *p; /* next free byte in output buffer */
4756 Py_ssize_t nallocated; /* number of result bytes allocated */
4757 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004758 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004759 PyObject *errorHandler = NULL;
4760 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004761 int kind;
4762 void *data;
4763 Py_ssize_t size;
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004764 PyObject *rep = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004765
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004766 if (!PyUnicode_Check(unicode)) {
4767 PyErr_BadArgument();
4768 return NULL;
4769 }
4770
4771 if (PyUnicode_READY(unicode) == -1)
4772 return NULL;
4773
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004774 if (PyUnicode_UTF8(unicode))
4775 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4776 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004777
4778 kind = PyUnicode_KIND(unicode);
4779 data = PyUnicode_DATA(unicode);
4780 size = PyUnicode_GET_LENGTH(unicode);
4781
Tim Peters602f7402002-04-27 18:03:26 +00004782 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004783
Tim Peters602f7402002-04-27 18:03:26 +00004784 if (size <= MAX_SHORT_UNICHARS) {
4785 /* Write into the stack buffer; nallocated can't overflow.
4786 * At the end, we'll allocate exactly as much heap space as it
4787 * turns out we need.
4788 */
4789 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004790 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004791 p = stackbuf;
4792 }
4793 else {
4794 /* Overallocate on the heap, and give the excess back at the end. */
4795 nallocated = size * 4;
4796 if (nallocated / 4 != size) /* overflow! */
4797 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004798 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004799 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004800 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004801 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004802 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004803
Tim Peters602f7402002-04-27 18:03:26 +00004804 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004805 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004806
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004807 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004808 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004809 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004810
Guido van Rossumd57fd912000-03-10 22:53:23 +00004811 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004812 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004813 *p++ = (char)(0xc0 | (ch >> 6));
4814 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner551ac952011-11-29 22:58:13 +01004815 } else if (Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004816 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004817 Py_ssize_t repsize, k, startpos;
4818 startpos = i-1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004819 rep = unicode_encode_call_errorhandler(
4820 errors, &errorHandler, "utf-8", "surrogates not allowed",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004821 unicode, &exc, startpos, startpos+1, &newpos);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004822 if (!rep)
4823 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004824
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004825 if (PyBytes_Check(rep))
4826 repsize = PyBytes_GET_SIZE(rep);
4827 else
Victor Stinner9e30aa52011-11-21 02:49:52 +01004828 repsize = PyUnicode_GET_LENGTH(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004829
4830 if (repsize > 4) {
4831 Py_ssize_t offset;
4832
4833 if (result == NULL)
4834 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004835 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004836 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004837
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004838 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4839 /* integer overflow */
4840 PyErr_NoMemory();
4841 goto error;
4842 }
4843 nallocated += repsize - 4;
4844 if (result != NULL) {
4845 if (_PyBytes_Resize(&result, nallocated) < 0)
4846 goto error;
4847 } else {
4848 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004849 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004850 goto error;
4851 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4852 }
4853 p = PyBytes_AS_STRING(result) + offset;
4854 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004855
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004856 if (PyBytes_Check(rep)) {
4857 char *prep = PyBytes_AS_STRING(rep);
4858 for(k = repsize; k > 0; k--)
4859 *p++ = *prep++;
4860 } else /* rep is unicode */ {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004861 enum PyUnicode_Kind repkind;
4862 void *repdata;
4863
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004864 if (PyUnicode_READY(rep) < 0)
Victor Stinnera98b28c2011-11-10 20:21:49 +01004865 goto error;
Victor Stinnera98b28c2011-11-10 20:21:49 +01004866 repkind = PyUnicode_KIND(rep);
4867 repdata = PyUnicode_DATA(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004868
4869 for(k=0; k<repsize; k++) {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004870 Py_UCS4 c = PyUnicode_READ(repkind, repdata, k);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004871 if (0x80 <= c) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01004872 raise_encode_exception(&exc, "utf-8",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004873 unicode,
Martin v. Löwis9e816682011-11-02 12:45:42 +01004874 i-1, i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004875 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004876 goto error;
4877 }
Victor Stinnera98b28c2011-11-10 20:21:49 +01004878 *p++ = (char)c;
Victor Stinner31be90b2010-04-22 19:38:16 +00004879 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004880 }
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004881 Py_CLEAR(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004882 } else if (ch < 0x10000) {
4883 *p++ = (char)(0xe0 | (ch >> 12));
4884 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4885 *p++ = (char)(0x80 | (ch & 0x3f));
4886 } else /* ch >= 0x10000 */ {
Victor Stinner0d3721d2011-11-22 03:27:53 +01004887 assert(ch <= 0x10FFFF);
Tim Peters602f7402002-04-27 18:03:26 +00004888 /* Encode UCS4 Unicode ordinals */
4889 *p++ = (char)(0xf0 | (ch >> 18));
4890 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4891 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4892 *p++ = (char)(0x80 | (ch & 0x3f));
4893 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004894 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004895
Guido van Rossum98297ee2007-11-06 21:34:58 +00004896 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004897 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004898 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004899 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004900 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004901 }
4902 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004903 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004904 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004905 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004906 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004907 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004908
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004909 Py_XDECREF(errorHandler);
4910 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004911 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004912 error:
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004913 Py_XDECREF(rep);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004914 Py_XDECREF(errorHandler);
4915 Py_XDECREF(exc);
4916 Py_XDECREF(result);
4917 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004918
Tim Peters602f7402002-04-27 18:03:26 +00004919#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004920}
4921
Alexander Belopolsky40018472011-02-26 01:02:56 +00004922PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004923PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4924 Py_ssize_t size,
4925 const char *errors)
4926{
4927 PyObject *v, *unicode;
4928
4929 unicode = PyUnicode_FromUnicode(s, size);
4930 if (unicode == NULL)
4931 return NULL;
4932 v = _PyUnicode_AsUTF8String(unicode, errors);
4933 Py_DECREF(unicode);
4934 return v;
4935}
4936
4937PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004938PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004939{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004940 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004941}
4942
Walter Dörwald41980ca2007-08-16 21:55:45 +00004943/* --- UTF-32 Codec ------------------------------------------------------- */
4944
4945PyObject *
4946PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004947 Py_ssize_t size,
4948 const char *errors,
4949 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004950{
4951 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4952}
4953
4954PyObject *
4955PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004956 Py_ssize_t size,
4957 const char *errors,
4958 int *byteorder,
4959 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004960{
4961 const char *starts = s;
4962 Py_ssize_t startinpos;
4963 Py_ssize_t endinpos;
4964 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004965 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004966 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004967 int bo = 0; /* assume native ordering by default */
4968 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004969 /* Offsets from q for retrieving bytes in the right order. */
4970#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4971 int iorder[] = {0, 1, 2, 3};
4972#else
4973 int iorder[] = {3, 2, 1, 0};
4974#endif
4975 PyObject *errorHandler = NULL;
4976 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004977
Walter Dörwald41980ca2007-08-16 21:55:45 +00004978 q = (unsigned char *)s;
4979 e = q + size;
4980
4981 if (byteorder)
4982 bo = *byteorder;
4983
4984 /* Check for BOM marks (U+FEFF) in the input and adjust current
4985 byte order setting accordingly. In native mode, the leading BOM
4986 mark is skipped, in all other modes, it is copied to the output
4987 stream as-is (giving a ZWNBSP character). */
4988 if (bo == 0) {
4989 if (size >= 4) {
4990 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004991 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004992#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004993 if (bom == 0x0000FEFF) {
4994 q += 4;
4995 bo = -1;
4996 }
4997 else if (bom == 0xFFFE0000) {
4998 q += 4;
4999 bo = 1;
5000 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005001#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005002 if (bom == 0x0000FEFF) {
5003 q += 4;
5004 bo = 1;
5005 }
5006 else if (bom == 0xFFFE0000) {
5007 q += 4;
5008 bo = -1;
5009 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005010#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005011 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005012 }
5013
5014 if (bo == -1) {
5015 /* force LE */
5016 iorder[0] = 0;
5017 iorder[1] = 1;
5018 iorder[2] = 2;
5019 iorder[3] = 3;
5020 }
5021 else if (bo == 1) {
5022 /* force BE */
5023 iorder[0] = 3;
5024 iorder[1] = 2;
5025 iorder[2] = 1;
5026 iorder[3] = 0;
5027 }
5028
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005029 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005030 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005031 if (!unicode)
5032 return NULL;
5033 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005034 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005035 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005036
Walter Dörwald41980ca2007-08-16 21:55:45 +00005037 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005038 Py_UCS4 ch;
5039 /* remaining bytes at the end? (size should be divisible by 4) */
5040 if (e-q<4) {
5041 if (consumed)
5042 break;
5043 errmsg = "truncated data";
5044 startinpos = ((const char *)q)-starts;
5045 endinpos = ((const char *)e)-starts;
5046 goto utf32Error;
5047 /* The remaining input chars are ignored if the callback
5048 chooses to skip the input */
5049 }
5050 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5051 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005052
Benjamin Peterson29060642009-01-31 22:14:21 +00005053 if (ch >= 0x110000)
5054 {
5055 errmsg = "codepoint not in range(0x110000)";
5056 startinpos = ((const char *)q)-starts;
5057 endinpos = startinpos+4;
5058 goto utf32Error;
5059 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005060 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5061 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005062 q += 4;
5063 continue;
5064 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005065 if (unicode_decode_call_errorhandler(
5066 errors, &errorHandler,
5067 "utf32", errmsg,
5068 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005069 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005070 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005071 }
5072
5073 if (byteorder)
5074 *byteorder = bo;
5075
5076 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005077 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005078
5079 /* Adjust length */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005080 if (PyUnicode_Resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005081 goto onError;
5082
5083 Py_XDECREF(errorHandler);
5084 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005085 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005086
Benjamin Peterson29060642009-01-31 22:14:21 +00005087 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005088 Py_DECREF(unicode);
5089 Py_XDECREF(errorHandler);
5090 Py_XDECREF(exc);
5091 return NULL;
5092}
5093
5094PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005095_PyUnicode_EncodeUTF32(PyObject *str,
5096 const char *errors,
5097 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005098{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005099 int kind;
5100 void *data;
5101 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005102 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005103 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005104 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005105 /* Offsets from p for storing byte pairs in the right order. */
5106#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5107 int iorder[] = {0, 1, 2, 3};
5108#else
5109 int iorder[] = {3, 2, 1, 0};
5110#endif
5111
Benjamin Peterson29060642009-01-31 22:14:21 +00005112#define STORECHAR(CH) \
5113 do { \
5114 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5115 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5116 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5117 p[iorder[0]] = (CH) & 0xff; \
5118 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005119 } while(0)
5120
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005121 if (!PyUnicode_Check(str)) {
5122 PyErr_BadArgument();
5123 return NULL;
5124 }
5125 if (PyUnicode_READY(str) < 0)
5126 return NULL;
5127 kind = PyUnicode_KIND(str);
5128 data = PyUnicode_DATA(str);
5129 len = PyUnicode_GET_LENGTH(str);
5130
5131 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005132 bytesize = nsize * 4;
5133 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005134 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005135 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005136 if (v == NULL)
5137 return NULL;
5138
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005139 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005140 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005141 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005142 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005143 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005144
5145 if (byteorder == -1) {
5146 /* force LE */
5147 iorder[0] = 0;
5148 iorder[1] = 1;
5149 iorder[2] = 2;
5150 iorder[3] = 3;
5151 }
5152 else if (byteorder == 1) {
5153 /* force BE */
5154 iorder[0] = 3;
5155 iorder[1] = 2;
5156 iorder[2] = 1;
5157 iorder[3] = 0;
5158 }
5159
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005160 for (i = 0; i < len; i++)
5161 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005162
5163 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005164 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005165#undef STORECHAR
5166}
5167
Alexander Belopolsky40018472011-02-26 01:02:56 +00005168PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005169PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5170 Py_ssize_t size,
5171 const char *errors,
5172 int byteorder)
5173{
5174 PyObject *result;
5175 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5176 if (tmp == NULL)
5177 return NULL;
5178 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5179 Py_DECREF(tmp);
5180 return result;
5181}
5182
5183PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005184PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005185{
Victor Stinnerb960b342011-11-20 19:12:52 +01005186 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005187}
5188
/* --- UTF-16 Codec ------------------------------------------------------- */
5190
Tim Peters772747b2001-08-09 22:21:55 +00005191PyObject *
5192PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005193 Py_ssize_t size,
5194 const char *errors,
5195 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005196{
Walter Dörwald69652032004-09-07 20:24:22 +00005197 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5198}
5199
Antoine Pitrouab868312009-01-10 15:40:25 +00005200/* Two masks for fast checking of whether a C 'long' may contain
5201 UTF16-encoded surrogate characters. This is an efficient heuristic,
5202 assuming that non-surrogate characters with a code point >= 0x8000 are
5203 rare in most input.
5204 FAST_CHAR_MASK is used when the input is in native byte ordering,
5205 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005206*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005207#if (SIZEOF_LONG == 8)
5208# define FAST_CHAR_MASK 0x8000800080008000L
5209# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5210#elif (SIZEOF_LONG == 4)
5211# define FAST_CHAR_MASK 0x80008000L
5212# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5213#else
5214# error C 'long' size should be either 4 or 8!
5215#endif
5216
Walter Dörwald69652032004-09-07 20:24:22 +00005217PyObject *
5218PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005219 Py_ssize_t size,
5220 const char *errors,
5221 int *byteorder,
5222 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005223{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005224 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005225 Py_ssize_t startinpos;
5226 Py_ssize_t endinpos;
5227 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005228 PyObject *unicode;
Antoine Pitrouab868312009-01-10 15:40:25 +00005229 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005230 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005231 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005232 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005233 /* Offsets from q for retrieving byte pairs in the right order. */
5234#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5235 int ihi = 1, ilo = 0;
5236#else
5237 int ihi = 0, ilo = 1;
5238#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005239 PyObject *errorHandler = NULL;
5240 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005241
5242 /* Note: size will always be longer than the resulting Unicode
5243 character count */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005244 unicode = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005245 if (!unicode)
5246 return NULL;
5247 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005248 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005249 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005250
Tim Peters772747b2001-08-09 22:21:55 +00005251 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005252 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005253
5254 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005255 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005256
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005257 /* Check for BOM marks (U+FEFF) in the input and adjust current
5258 byte order setting accordingly. In native mode, the leading BOM
5259 mark is skipped, in all other modes, it is copied to the output
5260 stream as-is (giving a ZWNBSP character). */
5261 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005262 if (size >= 2) {
Victor Stinner24729f32011-11-10 20:31:37 +01005263 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005264#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005265 if (bom == 0xFEFF) {
5266 q += 2;
5267 bo = -1;
5268 }
5269 else if (bom == 0xFFFE) {
5270 q += 2;
5271 bo = 1;
5272 }
Tim Petersced69f82003-09-16 20:30:58 +00005273#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005274 if (bom == 0xFEFF) {
5275 q += 2;
5276 bo = 1;
5277 }
5278 else if (bom == 0xFFFE) {
5279 q += 2;
5280 bo = -1;
5281 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005282#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005283 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005284 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005285
Tim Peters772747b2001-08-09 22:21:55 +00005286 if (bo == -1) {
5287 /* force LE */
5288 ihi = 1;
5289 ilo = 0;
5290 }
5291 else if (bo == 1) {
5292 /* force BE */
5293 ihi = 0;
5294 ilo = 1;
5295 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005296#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5297 native_ordering = ilo < ihi;
5298#else
5299 native_ordering = ilo > ihi;
5300#endif
Tim Peters772747b2001-08-09 22:21:55 +00005301
Antoine Pitrouab868312009-01-10 15:40:25 +00005302 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005303 while (q < e) {
Victor Stinner24729f32011-11-10 20:31:37 +01005304 Py_UCS4 ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005305 /* First check for possible aligned read of a C 'long'. Unaligned
5306 reads are more expensive, better to defer to another iteration. */
5307 if (!((size_t) q & LONG_PTR_MASK)) {
5308 /* Fast path for runs of non-surrogate chars. */
5309 register const unsigned char *_q = q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005310 int kind = PyUnicode_KIND(unicode);
5311 void *data = PyUnicode_DATA(unicode);
5312 while (_q < aligned_end) {
5313 unsigned long block = * (unsigned long *) _q;
5314 unsigned short *pblock = (unsigned short*)&block;
5315 Py_UCS4 maxch;
5316 if (native_ordering) {
5317 /* Can use buffer directly */
5318 if (block & FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005319 break;
Antoine Pitrouab868312009-01-10 15:40:25 +00005320 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005321 else {
5322 /* Need to byte-swap */
5323 unsigned char *_p = (unsigned char*)pblock;
5324 if (block & SWAPPED_FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005325 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005326 _p[0] = _q[1];
5327 _p[1] = _q[0];
5328 _p[2] = _q[3];
5329 _p[3] = _q[2];
Antoine Pitrouab868312009-01-10 15:40:25 +00005330#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005331 _p[4] = _q[5];
5332 _p[5] = _q[4];
5333 _p[6] = _q[7];
5334 _p[7] = _q[6];
Antoine Pitrouab868312009-01-10 15:40:25 +00005335#endif
Antoine Pitrouab868312009-01-10 15:40:25 +00005336 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005337 maxch = Py_MAX(pblock[0], pblock[1]);
5338#if SIZEOF_LONG == 8
5339 maxch = Py_MAX(maxch, Py_MAX(pblock[2], pblock[3]));
5340#endif
5341 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
5342 if (unicode_widen(&unicode, maxch) < 0)
5343 goto onError;
5344 kind = PyUnicode_KIND(unicode);
5345 data = PyUnicode_DATA(unicode);
5346 }
5347 PyUnicode_WRITE(kind, data, outpos++, pblock[0]);
5348 PyUnicode_WRITE(kind, data, outpos++, pblock[1]);
5349#if SIZEOF_LONG == 8
5350 PyUnicode_WRITE(kind, data, outpos++, pblock[2]);
5351 PyUnicode_WRITE(kind, data, outpos++, pblock[3]);
5352#endif
5353 _q += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00005354 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005355 q = _q;
5356 if (q >= e)
5357 break;
5358 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005359 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005360
Benjamin Peterson14339b62009-01-31 16:36:08 +00005361 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005362
Victor Stinner551ac952011-11-29 22:58:13 +01005363 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005364 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5365 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005366 continue;
5367 }
5368
5369 /* UTF-16 code pair: */
5370 if (q > e) {
5371 errmsg = "unexpected end of data";
5372 startinpos = (((const char *)q) - 2) - starts;
5373 endinpos = ((const char *)e) + 1 - starts;
5374 goto utf16Error;
5375 }
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005376 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5377 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
Benjamin Peterson29060642009-01-31 22:14:21 +00005378 q += 2;
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005379 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
Victor Stinner62aa4d02011-11-09 00:03:45 +01005380 if (unicode_putchar(&unicode, &outpos,
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005381 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005382 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005383 continue;
5384 }
5385 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005386 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005387 startinpos = (((const char *)q)-4)-starts;
5388 endinpos = startinpos+2;
5389 goto utf16Error;
5390 }
5391
Benjamin Peterson14339b62009-01-31 16:36:08 +00005392 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005393 errmsg = "illegal encoding";
5394 startinpos = (((const char *)q)-2)-starts;
5395 endinpos = startinpos+2;
5396 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005397
Benjamin Peterson29060642009-01-31 22:14:21 +00005398 utf16Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005399 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005400 errors,
5401 &errorHandler,
5402 "utf16", errmsg,
5403 &starts,
5404 (const char **)&e,
5405 &startinpos,
5406 &endinpos,
5407 &exc,
5408 (const char **)&q,
5409 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005410 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005411 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005412 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005413 /* remaining byte at the end? (size should be even) */
5414 if (e == q) {
5415 if (!consumed) {
5416 errmsg = "truncated data";
5417 startinpos = ((const char *)q) - starts;
5418 endinpos = ((const char *)e) + 1 - starts;
Antoine Pitrouab868312009-01-10 15:40:25 +00005419 if (unicode_decode_call_errorhandler(
5420 errors,
5421 &errorHandler,
5422 "utf16", errmsg,
5423 &starts,
5424 (const char **)&e,
5425 &startinpos,
5426 &endinpos,
5427 &exc,
5428 (const char **)&q,
5429 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005430 &outpos))
Antoine Pitrouab868312009-01-10 15:40:25 +00005431 goto onError;
5432 /* The remaining input chars are ignored if the callback
5433 chooses to skip the input */
5434 }
5435 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005436
5437 if (byteorder)
5438 *byteorder = bo;
5439
Walter Dörwald69652032004-09-07 20:24:22 +00005440 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005441 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005442
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443 /* Adjust length */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005444 if (PyUnicode_Resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005445 goto onError;
5446
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005447 Py_XDECREF(errorHandler);
5448 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005449 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005450
Benjamin Peterson29060642009-01-31 22:14:21 +00005451 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005452 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005453 Py_XDECREF(errorHandler);
5454 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005455 return NULL;
5456}
5457
Antoine Pitrouab868312009-01-10 15:40:25 +00005458#undef FAST_CHAR_MASK
5459#undef SWAPPED_FAST_CHAR_MASK
5460
Tim Peters772747b2001-08-09 22:21:55 +00005461PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005462_PyUnicode_EncodeUTF16(PyObject *str,
5463 const char *errors,
5464 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005465{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005466 int kind;
5467 void *data;
5468 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005469 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005470 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005471 Py_ssize_t nsize, bytesize;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005472 Py_ssize_t i, pairs;
Tim Peters772747b2001-08-09 22:21:55 +00005473 /* Offsets from p for storing byte pairs in the right order. */
5474#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5475 int ihi = 1, ilo = 0;
5476#else
5477 int ihi = 0, ilo = 1;
5478#endif
5479
Benjamin Peterson29060642009-01-31 22:14:21 +00005480#define STORECHAR(CH) \
5481 do { \
5482 p[ihi] = ((CH) >> 8) & 0xff; \
5483 p[ilo] = (CH) & 0xff; \
5484 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005485 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005486
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005487 if (!PyUnicode_Check(str)) {
5488 PyErr_BadArgument();
5489 return NULL;
5490 }
5491 if (PyUnicode_READY(str) < 0)
5492 return NULL;
5493 kind = PyUnicode_KIND(str);
5494 data = PyUnicode_DATA(str);
5495 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005496
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005497 pairs = 0;
5498 if (kind == PyUnicode_4BYTE_KIND)
5499 for (i = 0; i < len; i++)
5500 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5501 pairs++;
5502 /* 2 * (len + pairs + (byteorder == 0)) */
5503 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005504 return PyErr_NoMemory();
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005505 nsize = len + pairs + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005506 bytesize = nsize * 2;
5507 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005508 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005509 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005510 if (v == NULL)
5511 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005512
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005513 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005514 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005515 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005516 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005517 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005518
5519 if (byteorder == -1) {
5520 /* force LE */
5521 ihi = 1;
5522 ilo = 0;
5523 }
5524 else if (byteorder == 1) {
5525 /* force BE */
5526 ihi = 0;
5527 ilo = 1;
5528 }
5529
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005530 for (i = 0; i < len; i++) {
5531 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5532 Py_UCS4 ch2 = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +00005533 if (ch >= 0x10000) {
Victor Stinner551ac952011-11-29 22:58:13 +01005534 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
5535 ch = Py_UNICODE_HIGH_SURROGATE(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00005536 }
Tim Peters772747b2001-08-09 22:21:55 +00005537 STORECHAR(ch);
5538 if (ch2)
5539 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005540 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005541
5542 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005543 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005544#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545}
5546
Alexander Belopolsky40018472011-02-26 01:02:56 +00005547PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005548PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5549 Py_ssize_t size,
5550 const char *errors,
5551 int byteorder)
5552{
5553 PyObject *result;
5554 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5555 if (tmp == NULL)
5556 return NULL;
5557 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5558 Py_DECREF(tmp);
5559 return result;
5560}
5561
5562PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005563PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005564{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005565 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005566}
5567
/* --- Unicode Escape Codec ----------------------------------------------- */
5569
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   the escaped input decodes to a pure ASCII string and, if so, how many
   characters the decoded string has.

   s     Start of the escaped input.
   size  Number of bytes at `s`.

   Returns the decoded length, or -1 when the length cannot be determined
   this way: `size` is negative, a byte is > 127, a backslash is the last
   byte, or an escape is used whose result is not guaranteed to be ASCII
   (octal, \x, \u, \U, \N). */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before forming `s + size`; computing a
       pointer in front of the buffer is undefined behaviour. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5624
/* File-scope pointer to a _PyUnicode_Name_CAPI structure; starts out NULL.
   NOTE(review): presumably filled in on demand by the \N{...} handling of
   the unicode-escape decoder -- confirm against the code that assigns it. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005626
Alexander Belopolsky40018472011-02-26 01:02:56 +00005627PyObject *
5628PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005629 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005630 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005631{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005632 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005633 Py_ssize_t startinpos;
5634 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005635 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005636 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005637 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005638 char* message;
5639 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005640 PyObject *errorHandler = NULL;
5641 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005642 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005643 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005644
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005645 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005646
5647 /* After length_of_escaped_ascii_string() there are two alternatives,
5648 either the string is pure ASCII with named escapes like \n, etc.
5649 and we determined it's exact size (common case)
5650 or it contains \x, \u, ... escape sequences. then we create a
5651 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005652 if (len >= 0) {
5653 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005654 if (!v)
5655 goto onError;
5656 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005657 }
5658 else {
5659 /* Escaped strings will always be longer than the resulting
5660 Unicode string, so we start with size here and then reduce the
5661 length after conversion to the true value.
5662 (but if the error callback returns a long replacement string
5663 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005664 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005665 if (!v)
5666 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005667 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005668 }
5669
Guido van Rossumd57fd912000-03-10 22:53:23 +00005670 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005671 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005672 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005673 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005674
Guido van Rossumd57fd912000-03-10 22:53:23 +00005675 while (s < end) {
5676 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005677 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005678 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005679
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005680 /* The only case in which i == ascii_length is a backslash
5681 followed by a newline. */
5682 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005683
Guido van Rossumd57fd912000-03-10 22:53:23 +00005684 /* Non-escape characters are interpreted as Unicode ordinals */
5685 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005686 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5687 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005688 continue;
5689 }
5690
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005691 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005692 /* \ - Escapes */
5693 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005694 c = *s++;
5695 if (s > end)
5696 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005697
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005698 /* The only case in which i == ascii_length is a backslash
5699 followed by a newline. */
5700 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005701
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005702 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703
Benjamin Peterson29060642009-01-31 22:14:21 +00005704 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005705#define WRITECHAR(ch) \
5706 do { \
5707 if (unicode_putchar(&v, &i, ch) < 0) \
5708 goto onError; \
5709 }while(0)
5710
Guido van Rossumd57fd912000-03-10 22:53:23 +00005711 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005712 case '\\': WRITECHAR('\\'); break;
5713 case '\'': WRITECHAR('\''); break;
5714 case '\"': WRITECHAR('\"'); break;
5715 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005716 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005717 case 'f': WRITECHAR('\014'); break;
5718 case 't': WRITECHAR('\t'); break;
5719 case 'n': WRITECHAR('\n'); break;
5720 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005721 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005722 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005723 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005724 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005725
Benjamin Peterson29060642009-01-31 22:14:21 +00005726 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727 case '0': case '1': case '2': case '3':
5728 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005729 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005730 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005731 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005732 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005733 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005734 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005735 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005736 break;
5737
Benjamin Peterson29060642009-01-31 22:14:21 +00005738 /* hex escapes */
5739 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005740 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005741 digits = 2;
5742 message = "truncated \\xXX escape";
5743 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005744
Benjamin Peterson29060642009-01-31 22:14:21 +00005745 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005746 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005747 digits = 4;
5748 message = "truncated \\uXXXX escape";
5749 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005750
Benjamin Peterson29060642009-01-31 22:14:21 +00005751 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005752 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005753 digits = 8;
5754 message = "truncated \\UXXXXXXXX escape";
5755 hexescape:
5756 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005757 if (s+digits>end) {
5758 endinpos = size;
5759 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005760 errors, &errorHandler,
5761 "unicodeescape", "end of string in escape sequence",
5762 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005763 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005764 goto onError;
5765 goto nextByte;
5766 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005767 for (j = 0; j < digits; ++j) {
5768 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005769 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005770 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005771 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005772 errors, &errorHandler,
5773 "unicodeescape", message,
5774 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005775 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005776 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005777 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005778 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005779 }
5780 chr = (chr<<4) & ~0xF;
5781 if (c >= '0' && c <= '9')
5782 chr += c - '0';
5783 else if (c >= 'a' && c <= 'f')
5784 chr += 10 + c - 'a';
5785 else
5786 chr += 10 + c - 'A';
5787 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005788 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005789 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005790 /* _decoding_error will have already written into the
5791 target buffer. */
5792 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005793 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005794 /* when we get here, chr is a 32-bit unicode character */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005795 if (chr <= 0x10ffff) {
5796 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005797 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005798 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005799 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005800 errors, &errorHandler,
5801 "unicodeescape", "illegal Unicode character",
5802 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005803 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005804 goto onError;
5805 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005806 break;
5807
Benjamin Peterson29060642009-01-31 22:14:21 +00005808 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005809 case 'N':
5810 message = "malformed \\N character escape";
5811 if (ucnhash_CAPI == NULL) {
5812 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005813 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5814 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005815 if (ucnhash_CAPI == NULL)
5816 goto ucnhashError;
5817 }
5818 if (*s == '{') {
5819 const char *start = s+1;
5820 /* look for the closing brace */
5821 while (*s != '}' && s < end)
5822 s++;
5823 if (s > start && s < end && *s == '}') {
5824 /* found a name. look it up in the unicode database */
5825 message = "unknown Unicode character name";
5826 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005827 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005828 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005829 goto store;
5830 }
5831 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005832 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005833 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005834 errors, &errorHandler,
5835 "unicodeescape", message,
5836 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005837 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005838 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005839 break;
5840
5841 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005842 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005843 message = "\\ at end of string";
5844 s--;
5845 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005846 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005847 errors, &errorHandler,
5848 "unicodeescape", message,
5849 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005850 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00005851 goto onError;
5852 }
5853 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005854 WRITECHAR('\\');
5855 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005856 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005857 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005858 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005859 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005860 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005861 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005862#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005863
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005864 if (PyUnicode_Resize(&v, i) < 0)
5865 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005866 Py_XDECREF(errorHandler);
5867 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005868 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005869
Benjamin Peterson29060642009-01-31 22:14:21 +00005870 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005871 PyErr_SetString(
5872 PyExc_UnicodeError,
5873 "\\N escapes not supported (can't load unicodedata module)"
5874 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005875 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005876 Py_XDECREF(errorHandler);
5877 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005878 return NULL;
5879
Benjamin Peterson29060642009-01-31 22:14:21 +00005880 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005882 Py_XDECREF(errorHandler);
5883 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884 return NULL;
5885}
5886
5887/* Return a Unicode-Escape string version of the Unicode object.
5888
5889 If quotes is true, the string is enclosed in u"" or u'' quotes as
5890 appropriate.
5891
5892*/
5893
Alexander Belopolsky40018472011-02-26 01:02:56 +00005894PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005895PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005896{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005897 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005898 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005899 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005900 int kind;
5901 void *data;
5902 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903
Thomas Wouters89f507f2006-12-13 04:49:30 +00005904 /* Initial allocation is based on the longest-possible unichr
5905 escape.
5906
5907 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5908 unichr, so in this case it's the longest unichr escape. In
5909 narrow (UTF-16) builds this is five chars per source unichr
5910 since there are two unichrs in the surrogate pair, so in narrow
5911 (UTF-16) builds it's not the longest unichr escape.
5912
5913 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5914 so in the narrow (UTF-16) build case it's the longest unichr
5915 escape.
5916 */
5917
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005918 if (!PyUnicode_Check(unicode)) {
5919 PyErr_BadArgument();
5920 return NULL;
5921 }
5922 if (PyUnicode_READY(unicode) < 0)
5923 return NULL;
5924 len = PyUnicode_GET_LENGTH(unicode);
5925 kind = PyUnicode_KIND(unicode);
5926 data = PyUnicode_DATA(unicode);
5927 switch(kind) {
5928 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5929 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5930 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5931 }
5932
5933 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005934 return PyBytes_FromStringAndSize(NULL, 0);
5935
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005936 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005937 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005938
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005939 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005940 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005941 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005942 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943 if (repr == NULL)
5944 return NULL;
5945
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005946 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005947
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005948 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005949 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005950
Walter Dörwald79e913e2007-05-12 11:08:06 +00005951 /* Escape backslashes */
5952 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953 *p++ = '\\';
5954 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005955 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005956 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005957
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005958 /* Map 21-bit characters to '\U00xxxxxx' */
5959 else if (ch >= 0x10000) {
Victor Stinner0d3721d2011-11-22 03:27:53 +01005960 assert(ch <= 0x10FFFF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005961 *p++ = '\\';
5962 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005963 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5964 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5965 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5966 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5967 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5968 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5969 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5970 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005971 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005972 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005973
Guido van Rossumd57fd912000-03-10 22:53:23 +00005974 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005975 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976 *p++ = '\\';
5977 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005978 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5979 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5980 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5981 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005982 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005983
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005984 /* Map special whitespace to '\t', \n', '\r' */
5985 else if (ch == '\t') {
5986 *p++ = '\\';
5987 *p++ = 't';
5988 }
5989 else if (ch == '\n') {
5990 *p++ = '\\';
5991 *p++ = 'n';
5992 }
5993 else if (ch == '\r') {
5994 *p++ = '\\';
5995 *p++ = 'r';
5996 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005997
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005998 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005999 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006001 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006002 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6003 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006004 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006005
Guido van Rossumd57fd912000-03-10 22:53:23 +00006006 /* Copy everything else as-is */
6007 else
6008 *p++ = (char) ch;
6009 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006011 assert(p - PyBytes_AS_STRING(repr) > 0);
6012 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6013 return NULL;
6014 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015}
6016
Alexander Belopolsky40018472011-02-26 01:02:56 +00006017PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006018PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6019 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006021 PyObject *result;
6022 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6023 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006025 result = PyUnicode_AsUnicodeEscapeString(tmp);
6026 Py_DECREF(tmp);
6027 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028}
6029
6030/* --- Raw Unicode Escape Codec ------------------------------------------- */
6031
Alexander Belopolsky40018472011-02-26 01:02:56 +00006032PyObject *
6033PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006034 Py_ssize_t size,
6035 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006037 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006038 Py_ssize_t startinpos;
6039 Py_ssize_t endinpos;
6040 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006041 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006042 const char *end;
6043 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006044 PyObject *errorHandler = NULL;
6045 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006046
Guido van Rossumd57fd912000-03-10 22:53:23 +00006047 /* Escaped strings will always be longer than the resulting
6048 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006049 length after conversion to the true value. (But decoding error
6050 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006051 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006052 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006053 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006055 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006056 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057 end = s + size;
6058 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006059 unsigned char c;
6060 Py_UCS4 x;
6061 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006062 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006063
Benjamin Peterson29060642009-01-31 22:14:21 +00006064 /* Non-escape characters are interpreted as Unicode ordinals */
6065 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006066 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6067 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006068 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006069 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006070 startinpos = s-starts;
6071
6072 /* \u-escapes are only interpreted iff the number of leading
6073 backslashes if odd */
6074 bs = s;
6075 for (;s < end;) {
6076 if (*s != '\\')
6077 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006078 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6079 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006080 }
6081 if (((s - bs) & 1) == 0 ||
6082 s >= end ||
6083 (*s != 'u' && *s != 'U')) {
6084 continue;
6085 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006086 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006087 count = *s=='u' ? 4 : 8;
6088 s++;
6089
6090 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006091 for (x = 0, i = 0; i < count; ++i, ++s) {
6092 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006093 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006094 endinpos = s-starts;
6095 if (unicode_decode_call_errorhandler(
6096 errors, &errorHandler,
6097 "rawunicodeescape", "truncated \\uXXXX",
6098 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006099 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006100 goto onError;
6101 goto nextByte;
6102 }
6103 x = (x<<4) & ~0xF;
6104 if (c >= '0' && c <= '9')
6105 x += c - '0';
6106 else if (c >= 'a' && c <= 'f')
6107 x += 10 + c - 'a';
6108 else
6109 x += 10 + c - 'A';
6110 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006111 if (x <= 0x10ffff) {
6112 if (unicode_putchar(&v, &outpos, x) < 0)
6113 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006114 } else {
6115 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006116 if (unicode_decode_call_errorhandler(
6117 errors, &errorHandler,
6118 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006119 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006120 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006121 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006122 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006123 nextByte:
6124 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006125 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006126 if (PyUnicode_Resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006127 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006128 Py_XDECREF(errorHandler);
6129 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006130 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006131
Benjamin Peterson29060642009-01-31 22:14:21 +00006132 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006134 Py_XDECREF(errorHandler);
6135 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006136 return NULL;
6137}
6138
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006139
Alexander Belopolsky40018472011-02-26 01:02:56 +00006140PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006141PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006142{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006143 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006144 char *p;
6145 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006146 Py_ssize_t expandsize, pos;
6147 int kind;
6148 void *data;
6149 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006150
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006151 if (!PyUnicode_Check(unicode)) {
6152 PyErr_BadArgument();
6153 return NULL;
6154 }
6155 if (PyUnicode_READY(unicode) < 0)
6156 return NULL;
6157 kind = PyUnicode_KIND(unicode);
6158 data = PyUnicode_DATA(unicode);
6159 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006160 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6161 bytes, and 1 byte characters 4. */
6162 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006163
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006164 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006165 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006166
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006167 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006168 if (repr == NULL)
6169 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006170 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006171 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006172
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006173 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006174 for (pos = 0; pos < len; pos++) {
6175 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006176 /* Map 32-bit characters to '\Uxxxxxxxx' */
6177 if (ch >= 0x10000) {
Victor Stinner0d3721d2011-11-22 03:27:53 +01006178 assert(ch <= 0x10FFFF);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006179 *p++ = '\\';
6180 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006181 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6182 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6183 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6184 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6185 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6186 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6187 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6188 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006189 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006190 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006191 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006192 *p++ = '\\';
6193 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006194 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6195 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6196 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6197 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006198 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006199 /* Copy everything else as-is */
6200 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006201 *p++ = (char) ch;
6202 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006203
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006204 assert(p > q);
6205 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006206 return NULL;
6207 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006208}
6209
Alexander Belopolsky40018472011-02-26 01:02:56 +00006210PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006211PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6212 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006214 PyObject *result;
6215 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6216 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006217 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006218 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6219 Py_DECREF(tmp);
6220 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221}
6222
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006223/* --- Unicode Internal Codec ------------------------------------------- */
6224
Alexander Belopolsky40018472011-02-26 01:02:56 +00006225PyObject *
6226_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006227 Py_ssize_t size,
6228 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006229{
6230 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006231 Py_ssize_t startinpos;
6232 Py_ssize_t endinpos;
6233 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006234 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006235 const char *end;
6236 const char *reason;
6237 PyObject *errorHandler = NULL;
6238 PyObject *exc = NULL;
6239
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006240 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006241 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006242 1))
6243 return NULL;
6244
Thomas Wouters89f507f2006-12-13 04:49:30 +00006245 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006246 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006247 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006248 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006249 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006250 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006251 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006252 end = s + size;
6253
6254 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006255 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006256 Py_UCS4 ch;
6257 /* We copy the raw representation one byte at a time because the
6258 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006259 ((char *) &uch)[0] = s[0];
6260 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006261#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006262 ((char *) &uch)[2] = s[2];
6263 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006264#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006265 ch = uch;
6266
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006267 /* We have to sanity check the raw data, otherwise doom looms for
6268 some malformed UCS-4 data. */
6269 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006270#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006271 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006272#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006273 end-s < Py_UNICODE_SIZE
6274 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006275 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006276 startinpos = s - starts;
6277 if (end-s < Py_UNICODE_SIZE) {
6278 endinpos = end-starts;
6279 reason = "truncated input";
6280 }
6281 else {
6282 endinpos = s - starts + Py_UNICODE_SIZE;
6283 reason = "illegal code point (> 0x10FFFF)";
6284 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006285 if (unicode_decode_call_errorhandler(
6286 errors, &errorHandler,
6287 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006288 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006289 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006290 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006291 continue;
6292 }
6293
6294 s += Py_UNICODE_SIZE;
6295#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006296 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006297 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006298 Py_UNICODE uch2;
6299 ((char *) &uch2)[0] = s[0];
6300 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006301 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006302 {
Victor Stinner551ac952011-11-29 22:58:13 +01006303 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006304 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006305 }
6306 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006307#endif
6308
6309 if (unicode_putchar(&v, &outpos, ch) < 0)
6310 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006311 }
6312
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006313 if (PyUnicode_Resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006314 goto onError;
6315 Py_XDECREF(errorHandler);
6316 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006317 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006318
Benjamin Peterson29060642009-01-31 22:14:21 +00006319 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006320 Py_XDECREF(v);
6321 Py_XDECREF(errorHandler);
6322 Py_XDECREF(exc);
6323 return NULL;
6324}
6325
Guido van Rossumd57fd912000-03-10 22:53:23 +00006326/* --- Latin-1 Codec ------------------------------------------------------ */
6327
Alexander Belopolsky40018472011-02-26 01:02:56 +00006328PyObject *
6329PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006330 Py_ssize_t size,
6331 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006332{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006333 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006334 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006335}
6336
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006337/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006338static void
6339make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006340 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006341 PyObject *unicode,
6342 Py_ssize_t startpos, Py_ssize_t endpos,
6343 const char *reason)
6344{
6345 if (*exceptionObject == NULL) {
6346 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006347 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006348 encoding, unicode, startpos, endpos, reason);
6349 }
6350 else {
6351 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6352 goto onError;
6353 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6354 goto onError;
6355 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6356 goto onError;
6357 return;
6358 onError:
6359 Py_DECREF(*exceptionObject);
6360 *exceptionObject = NULL;
6361 }
6362}
6363
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006364/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006365static void
6366raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006367 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006368 PyObject *unicode,
6369 Py_ssize_t startpos, Py_ssize_t endpos,
6370 const char *reason)
6371{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006372 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006373 encoding, unicode, startpos, endpos, reason);
6374 if (*exceptionObject != NULL)
6375 PyCodec_StrictErrors(*exceptionObject);
6376}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006377
6378/* error handling callback helper:
6379 build arguments, call the callback and check the arguments,
6380 put the result into newpos and return the replacement string, which
6381 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006382static PyObject *
6383unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006384 PyObject **errorHandler,
6385 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006386 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006387 Py_ssize_t startpos, Py_ssize_t endpos,
6388 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006389{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006390 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006391 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006392 PyObject *restuple;
6393 PyObject *resunicode;
6394
6395 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006396 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006397 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006398 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006399 }
6400
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006401 if (PyUnicode_READY(unicode) < 0)
6402 return NULL;
6403 len = PyUnicode_GET_LENGTH(unicode);
6404
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006405 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006406 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006407 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006408 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006409
6410 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006411 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006412 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006413 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006414 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006415 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006416 Py_DECREF(restuple);
6417 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006418 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006419 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006420 &resunicode, newpos)) {
6421 Py_DECREF(restuple);
6422 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006423 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006424 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6425 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6426 Py_DECREF(restuple);
6427 return NULL;
6428 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006429 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006430 *newpos = len + *newpos;
6431 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006432 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6433 Py_DECREF(restuple);
6434 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006435 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006436 Py_INCREF(resunicode);
6437 Py_DECREF(restuple);
6438 return resunicode;
6439}
6440
Alexander Belopolsky40018472011-02-26 01:02:56 +00006441static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006442unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006443 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006444 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006445{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006446 /* input state */
6447 Py_ssize_t pos=0, size;
6448 int kind;
6449 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006450 /* output object */
6451 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006452 /* pointer into the output */
6453 char *str;
6454 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006455 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006456 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6457 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006458 PyObject *errorHandler = NULL;
6459 PyObject *exc = NULL;
6460 /* the following variable is used for caching string comparisons
6461 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6462 int known_errorHandler = -1;
6463
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006464 if (PyUnicode_READY(unicode) < 0)
6465 return NULL;
6466 size = PyUnicode_GET_LENGTH(unicode);
6467 kind = PyUnicode_KIND(unicode);
6468 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006469 /* allocate enough for a simple encoding without
6470 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006471 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006472 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006473 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006474 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006475 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006476 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006477 ressize = size;
6478
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006479 while (pos < size) {
6480 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006481
Benjamin Peterson29060642009-01-31 22:14:21 +00006482 /* can we encode this? */
6483 if (c<limit) {
6484 /* no overflow check, because we know that the space is enough */
6485 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006486 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006487 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006488 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006489 Py_ssize_t requiredsize;
6490 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006491 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006492 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006493 Py_ssize_t collstart = pos;
6494 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006495 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006496 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006497 ++collend;
6498 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6499 if (known_errorHandler==-1) {
6500 if ((errors==NULL) || (!strcmp(errors, "strict")))
6501 known_errorHandler = 1;
6502 else if (!strcmp(errors, "replace"))
6503 known_errorHandler = 2;
6504 else if (!strcmp(errors, "ignore"))
6505 known_errorHandler = 3;
6506 else if (!strcmp(errors, "xmlcharrefreplace"))
6507 known_errorHandler = 4;
6508 else
6509 known_errorHandler = 0;
6510 }
6511 switch (known_errorHandler) {
6512 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006513 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006514 goto onError;
6515 case 2: /* replace */
6516 while (collstart++<collend)
6517 *str++ = '?'; /* fall through */
6518 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006519 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006520 break;
6521 case 4: /* xmlcharrefreplace */
6522 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006523 /* determine replacement size */
6524 for (i = collstart, repsize = 0; i < collend; ++i) {
6525 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6526 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006527 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006528 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006529 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006530 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006531 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006532 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006533 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006534 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006535 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006536 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006537 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006538 else {
6539 assert(ch <= 0x10FFFF);
Benjamin Peterson29060642009-01-31 22:14:21 +00006540 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006541 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006542 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006543 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006544 if (requiredsize > ressize) {
6545 if (requiredsize<2*ressize)
6546 requiredsize = 2*ressize;
6547 if (_PyBytes_Resize(&res, requiredsize))
6548 goto onError;
6549 str = PyBytes_AS_STRING(res) + respos;
6550 ressize = requiredsize;
6551 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006552 /* generate replacement */
6553 for (i = collstart; i < collend; ++i) {
6554 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006555 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006556 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006557 break;
6558 default:
6559 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006560 encoding, reason, unicode, &exc,
6561 collstart, collend, &newpos);
6562 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
6563 PyUnicode_READY(repunicode) < 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00006564 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006565 if (PyBytes_Check(repunicode)) {
6566 /* Directly copy bytes result to output. */
6567 repsize = PyBytes_Size(repunicode);
6568 if (repsize > 1) {
6569 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006570 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006571 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6572 Py_DECREF(repunicode);
6573 goto onError;
6574 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006575 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006576 ressize += repsize-1;
6577 }
6578 memcpy(str, PyBytes_AsString(repunicode), repsize);
6579 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006580 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006581 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006582 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006583 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006584 /* need more space? (at least enough for what we
6585 have+the replacement+the rest of the string, so
6586 we won't have to check space for encodable characters) */
6587 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006588 repsize = PyUnicode_GET_LENGTH(repunicode);
6589 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006590 if (requiredsize > ressize) {
6591 if (requiredsize<2*ressize)
6592 requiredsize = 2*ressize;
6593 if (_PyBytes_Resize(&res, requiredsize)) {
6594 Py_DECREF(repunicode);
6595 goto onError;
6596 }
6597 str = PyBytes_AS_STRING(res) + respos;
6598 ressize = requiredsize;
6599 }
6600 /* check if there is anything unencodable in the replacement
6601 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006602 for (i = 0; repsize-->0; ++i, ++str) {
6603 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006604 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006605 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006606 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006607 Py_DECREF(repunicode);
6608 goto onError;
6609 }
6610 *str = (char)c;
6611 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006612 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006613 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006614 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006615 }
6616 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006617 /* Resize if we allocated to much */
6618 size = str - PyBytes_AS_STRING(res);
6619 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006620 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006621 if (_PyBytes_Resize(&res, size) < 0)
6622 goto onError;
6623 }
6624
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006625 Py_XDECREF(errorHandler);
6626 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006627 return res;
6628
6629 onError:
6630 Py_XDECREF(res);
6631 Py_XDECREF(errorHandler);
6632 Py_XDECREF(exc);
6633 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006634}
6635
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006636/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006637PyObject *
6638PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006639 Py_ssize_t size,
6640 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006641{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006642 PyObject *result;
6643 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6644 if (unicode == NULL)
6645 return NULL;
6646 result = unicode_encode_ucs1(unicode, errors, 256);
6647 Py_DECREF(unicode);
6648 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006649}
6650
Alexander Belopolsky40018472011-02-26 01:02:56 +00006651PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006652_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006653{
6654 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006655 PyErr_BadArgument();
6656 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006657 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006658 if (PyUnicode_READY(unicode) == -1)
6659 return NULL;
6660 /* Fast path: if it is a one-byte string, construct
6661 bytes object directly. */
6662 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6663 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6664 PyUnicode_GET_LENGTH(unicode));
6665 /* Non-Latin-1 characters present. Defer to above function to
6666 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006667 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006668}
6669
6670PyObject*
6671PyUnicode_AsLatin1String(PyObject *unicode)
6672{
6673 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006674}
6675
6676/* --- 7-bit ASCII Codec -------------------------------------------------- */
6677
Alexander Belopolsky40018472011-02-26 01:02:56 +00006678PyObject *
6679PyUnicode_DecodeASCII(const char *s,
6680 Py_ssize_t size,
6681 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006682{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006683 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006684 PyObject *v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006685 int kind;
6686 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006687 Py_ssize_t startinpos;
6688 Py_ssize_t endinpos;
6689 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006690 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006691 int has_error;
6692 const unsigned char *p = (const unsigned char *)s;
6693 const unsigned char *end = p + size;
6694 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006695 PyObject *errorHandler = NULL;
6696 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006697
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006698 if (size == 0) {
6699 Py_INCREF(unicode_empty);
6700 return unicode_empty;
6701 }
6702
Guido van Rossumd57fd912000-03-10 22:53:23 +00006703 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006704 if (size == 1 && (unsigned char)s[0] < 128)
6705 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006706
Victor Stinner702c7342011-10-05 13:50:52 +02006707 has_error = 0;
6708 while (p < end && !has_error) {
6709 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6710 an explanation. */
6711 if (!((size_t) p & LONG_PTR_MASK)) {
6712 /* Help register allocation */
6713 register const unsigned char *_p = p;
6714 while (_p < aligned_end) {
6715 unsigned long value = *(unsigned long *) _p;
6716 if (value & ASCII_CHAR_MASK) {
6717 has_error = 1;
6718 break;
6719 }
6720 _p += SIZEOF_LONG;
6721 }
6722 if (_p == end)
6723 break;
6724 if (has_error)
6725 break;
6726 p = _p;
6727 }
6728 if (*p & 0x80) {
6729 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006730 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006731 }
6732 else {
6733 ++p;
6734 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006735 }
Victor Stinner702c7342011-10-05 13:50:52 +02006736 if (!has_error)
6737 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006738
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006739 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006741 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006742 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006743 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006744 kind = PyUnicode_KIND(v);
6745 data = PyUnicode_DATA(v);
6746 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006747 e = s + size;
6748 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006749 register unsigned char c = (unsigned char)*s;
6750 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006751 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006752 ++s;
6753 }
6754 else {
6755 startinpos = s-starts;
6756 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006757 if (unicode_decode_call_errorhandler(
6758 errors, &errorHandler,
6759 "ascii", "ordinal not in range(128)",
6760 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006761 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006762 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006763 kind = PyUnicode_KIND(v);
6764 data = PyUnicode_DATA(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006765 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006766 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006767 if (PyUnicode_Resize(&v, outpos) < 0)
6768 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006769 Py_XDECREF(errorHandler);
6770 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006771 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006772 return v;
Tim Petersced69f82003-09-16 20:30:58 +00006773
Benjamin Peterson29060642009-01-31 22:14:21 +00006774 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006775 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006776 Py_XDECREF(errorHandler);
6777 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006778 return NULL;
6779}
6780
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006781/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006782PyObject *
6783PyUnicode_EncodeASCII(const Py_UNICODE *p,
6784 Py_ssize_t size,
6785 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006786{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006787 PyObject *result;
6788 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6789 if (unicode == NULL)
6790 return NULL;
6791 result = unicode_encode_ucs1(unicode, errors, 128);
6792 Py_DECREF(unicode);
6793 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006794}
6795
Alexander Belopolsky40018472011-02-26 01:02:56 +00006796PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006797_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006798{
6799 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006800 PyErr_BadArgument();
6801 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006802 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006803 if (PyUnicode_READY(unicode) == -1)
6804 return NULL;
6805 /* Fast path: if it is an ASCII-only string, construct bytes object
6806 directly. Else defer to above function to raise the exception. */
6807 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6808 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6809 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006810 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006811}
6812
6813PyObject *
6814PyUnicode_AsASCIIString(PyObject *unicode)
6815{
6816 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006817}
6818
Victor Stinner99b95382011-07-04 14:23:54 +02006819#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006820
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006821/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006822
/* NEED_RETRY is defined when int is narrower than size_t.  The stateful
   code page decoder below tests it to split inputs longer than INT_MAX
   into several chunks. */
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006823#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006824#define NEED_RETRY
6825#endif
6826
/* Fallback value for WC_ERR_INVALID_CHARS when the included headers do not
   define it (presumably older Windows SDKs -- TODO confirm). */
Victor Stinner3a50e702011-10-18 21:21:00 +02006827#ifndef WC_ERR_INVALID_CHARS
6828# define WC_ERR_INVALID_CHARS 0x0080
6829#endif
6830
6831static char*
6832code_page_name(UINT code_page, PyObject **obj)
6833{
6834 *obj = NULL;
6835 if (code_page == CP_ACP)
6836 return "mbcs";
6837 if (code_page == CP_UTF7)
6838 return "CP_UTF7";
6839 if (code_page == CP_UTF8)
6840 return "CP_UTF8";
6841
6842 *obj = PyBytes_FromFormat("cp%u", code_page);
6843 if (*obj == NULL)
6844 return NULL;
6845 return PyBytes_AS_STRING(*obj);
6846}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006847
Alexander Belopolsky40018472011-02-26 01:02:56 +00006848static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006849is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006850{
6851 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006852 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006853
Victor Stinner3a50e702011-10-18 21:21:00 +02006854 if (!IsDBCSLeadByteEx(code_page, *curr))
6855 return 0;
6856
6857 prev = CharPrevExA(code_page, s, curr, 0);
6858 if (prev == curr)
6859 return 1;
6860 /* FIXME: This code is limited to "true" double-byte encodings,
6861 as it assumes an incomplete character consists of a single
6862 byte. */
6863 if (curr - prev == 2)
6864 return 1;
6865 if (!IsDBCSLeadByteEx(code_page, *prev))
6866 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006867 return 0;
6868}
6869
Victor Stinner3a50e702011-10-18 21:21:00 +02006870static DWORD
6871decode_code_page_flags(UINT code_page)
6872{
6873 if (code_page == CP_UTF7) {
6874 /* The CP_UTF7 decoder only supports flags=0 */
6875 return 0;
6876 }
6877 else
6878 return MB_ERR_INVALID_CHARS;
6879}
6880
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006881/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006882 * Decode a byte string from a Windows code page into unicode object in strict
6883 * mode.
6884 *
6885 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6886 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006887 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006888static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006889decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006890 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006891 const char *in,
6892 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006893{
Victor Stinner3a50e702011-10-18 21:21:00 +02006894 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006895 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006896 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006897
6898 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006899 assert(insize > 0);
6900 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6901 if (outsize <= 0)
6902 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006903
6904 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006905 /* Create unicode object */
Victor Stinner76a31a62011-11-04 00:05:13 +01006906 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006907 if (*v == NULL)
6908 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006909 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006910 }
6911 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006912 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006913 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner76a31a62011-11-04 00:05:13 +01006914 if (PyUnicode_Resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006915 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006916 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006917 }
6918
6919 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006920 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6921 if (outsize <= 0)
6922 goto error;
6923 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006924
Victor Stinner3a50e702011-10-18 21:21:00 +02006925error:
6926 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6927 return -2;
6928 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006929 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006930}
6931
Victor Stinner3a50e702011-10-18 21:21:00 +02006932/*
6933 * Decode a byte string from a code page into unicode object with an error
6934 * handler.
6935 *
6936 * Returns consumed size if succeed, or raise a WindowsError or
6937 * UnicodeDecodeError exception and returns -1 on error.
6938 */
6939static int
6940decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006941 PyObject **v,
6942 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006943 const char *errors)
6944{
6945 const char *startin = in;
6946 const char *endin = in + size;
6947 const DWORD flags = decode_code_page_flags(code_page);
6948 /* Ideally, we should get reason from FormatMessage. This is the Windows
6949 2000 English version of the message. */
6950 const char *reason = "No mapping for the Unicode character exists "
6951 "in the target code page.";
6952 /* each step cannot decode more than 1 character, but a character can be
6953 represented as a surrogate pair */
6954 wchar_t buffer[2], *startout, *out;
6955 int insize, outsize;
6956 PyObject *errorHandler = NULL;
6957 PyObject *exc = NULL;
6958 PyObject *encoding_obj = NULL;
6959 char *encoding;
6960 DWORD err;
6961 int ret = -1;
6962
6963 assert(size > 0);
6964
6965 encoding = code_page_name(code_page, &encoding_obj);
6966 if (encoding == NULL)
6967 return -1;
6968
6969 if (errors == NULL || strcmp(errors, "strict") == 0) {
6970 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6971 UnicodeDecodeError. */
6972 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6973 if (exc != NULL) {
6974 PyCodec_StrictErrors(exc);
6975 Py_CLEAR(exc);
6976 }
6977 goto error;
6978 }
6979
6980 if (*v == NULL) {
6981 /* Create unicode object */
6982 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6983 PyErr_NoMemory();
6984 goto error;
6985 }
Victor Stinner76a31a62011-11-04 00:05:13 +01006986 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006987 if (*v == NULL)
6988 goto error;
6989 startout = PyUnicode_AS_UNICODE(*v);
6990 }
6991 else {
6992 /* Extend unicode object */
6993 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6994 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6995 PyErr_NoMemory();
6996 goto error;
6997 }
Victor Stinner76a31a62011-11-04 00:05:13 +01006998 if (PyUnicode_Resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006999 goto error;
7000 startout = PyUnicode_AS_UNICODE(*v) + n;
7001 }
7002
7003 /* Decode the byte string character per character */
7004 out = startout;
7005 while (in < endin)
7006 {
7007 /* Decode a character */
7008 insize = 1;
7009 do
7010 {
7011 outsize = MultiByteToWideChar(code_page, flags,
7012 in, insize,
7013 buffer, Py_ARRAY_LENGTH(buffer));
7014 if (outsize > 0)
7015 break;
7016 err = GetLastError();
7017 if (err != ERROR_NO_UNICODE_TRANSLATION
7018 && err != ERROR_INSUFFICIENT_BUFFER)
7019 {
7020 PyErr_SetFromWindowsErr(0);
7021 goto error;
7022 }
7023 insize++;
7024 }
7025 /* 4=maximum length of a UTF-8 sequence */
7026 while (insize <= 4 && (in + insize) <= endin);
7027
7028 if (outsize <= 0) {
7029 Py_ssize_t startinpos, endinpos, outpos;
7030
7031 startinpos = in - startin;
7032 endinpos = startinpos + 1;
7033 outpos = out - PyUnicode_AS_UNICODE(*v);
7034 if (unicode_decode_call_errorhandler(
7035 errors, &errorHandler,
7036 encoding, reason,
7037 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007038 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007039 {
7040 goto error;
7041 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007042 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007043 }
7044 else {
7045 in += insize;
7046 memcpy(out, buffer, outsize * sizeof(wchar_t));
7047 out += outsize;
7048 }
7049 }
7050
7051 /* write a NUL character at the end */
7052 *out = 0;
7053
7054 /* Extend unicode object */
7055 outsize = out - startout;
7056 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner76a31a62011-11-04 00:05:13 +01007057 if (PyUnicode_Resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007058 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007059 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007060
7061error:
7062 Py_XDECREF(encoding_obj);
7063 Py_XDECREF(errorHandler);
7064 Py_XDECREF(exc);
7065 return ret;
7066}
7067
Victor Stinner3a50e702011-10-18 21:21:00 +02007068static PyObject *
7069decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007070 const char *s, Py_ssize_t size,
7071 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007072{
Victor Stinner76a31a62011-11-04 00:05:13 +01007073 PyObject *v = NULL;
7074 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007075
Victor Stinner3a50e702011-10-18 21:21:00 +02007076 if (code_page < 0) {
7077 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7078 return NULL;
7079 }
7080
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007081 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007082 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007083
Victor Stinner76a31a62011-11-04 00:05:13 +01007084 do
7085 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007086#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007087 if (size > INT_MAX) {
7088 chunk_size = INT_MAX;
7089 final = 0;
7090 done = 0;
7091 }
7092 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007093#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007094 {
7095 chunk_size = (int)size;
7096 final = (consumed == NULL);
7097 done = 1;
7098 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007099
Victor Stinner76a31a62011-11-04 00:05:13 +01007100 /* Skip trailing lead-byte unless 'final' is set */
7101 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7102 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007103
Victor Stinner76a31a62011-11-04 00:05:13 +01007104 if (chunk_size == 0 && done) {
7105 if (v != NULL)
7106 break;
7107 Py_INCREF(unicode_empty);
7108 return unicode_empty;
7109 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007110
Victor Stinner76a31a62011-11-04 00:05:13 +01007111
7112 converted = decode_code_page_strict(code_page, &v,
7113 s, chunk_size);
7114 if (converted == -2)
7115 converted = decode_code_page_errors(code_page, &v,
7116 s, chunk_size,
7117 errors);
7118 assert(converted != 0);
7119
7120 if (converted < 0) {
7121 Py_XDECREF(v);
7122 return NULL;
7123 }
7124
7125 if (consumed)
7126 *consumed += converted;
7127
7128 s += converted;
7129 size -= converted;
7130 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007131
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007132 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007133}
7134
Alexander Belopolsky40018472011-02-26 01:02:56 +00007135PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007136PyUnicode_DecodeCodePageStateful(int code_page,
7137 const char *s,
7138 Py_ssize_t size,
7139 const char *errors,
7140 Py_ssize_t *consumed)
7141{
7142 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7143}
7144
7145PyObject *
7146PyUnicode_DecodeMBCSStateful(const char *s,
7147 Py_ssize_t size,
7148 const char *errors,
7149 Py_ssize_t *consumed)
7150{
7151 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7152}
7153
7154PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007155PyUnicode_DecodeMBCS(const char *s,
7156 Py_ssize_t size,
7157 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007158{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007159 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7160}
7161
Victor Stinner3a50e702011-10-18 21:21:00 +02007162static DWORD
7163encode_code_page_flags(UINT code_page, const char *errors)
7164{
7165 if (code_page == CP_UTF8) {
7166 if (winver.dwMajorVersion >= 6)
7167 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7168 and later */
7169 return WC_ERR_INVALID_CHARS;
7170 else
7171 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7172 return 0;
7173 }
7174 else if (code_page == CP_UTF7) {
7175 /* CP_UTF7 only supports flags=0 */
7176 return 0;
7177 }
7178 else {
7179 if (errors != NULL && strcmp(errors, "replace") == 0)
7180 return 0;
7181 else
7182 return WC_NO_BEST_FIT_CHARS;
7183 }
7184}
7185
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007186/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007187 * Encode a Unicode string to a Windows code page into a byte string in strict
7188 * mode.
7189 *
7190 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7191 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007192 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007193static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007194encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007195 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007196 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007197{
Victor Stinner554f3f02010-06-16 23:33:54 +00007198 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007199 BOOL *pusedDefaultChar = &usedDefaultChar;
7200 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007201 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007202 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007203 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007204 const DWORD flags = encode_code_page_flags(code_page, NULL);
7205 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007206 /* Create a substring so that we can get the UTF-16 representation
7207 of just the slice under consideration. */
7208 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007209
Martin v. Löwis3d325192011-11-04 18:23:06 +01007210 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007211
Victor Stinner3a50e702011-10-18 21:21:00 +02007212 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007213 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007214 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007215 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007216
Victor Stinner2fc507f2011-11-04 20:06:39 +01007217 substring = PyUnicode_Substring(unicode, offset, offset+len);
7218 if (substring == NULL)
7219 return -1;
7220 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7221 if (p == NULL) {
7222 Py_DECREF(substring);
7223 return -1;
7224 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007225
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007226 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007227 outsize = WideCharToMultiByte(code_page, flags,
7228 p, size,
7229 NULL, 0,
7230 NULL, pusedDefaultChar);
7231 if (outsize <= 0)
7232 goto error;
7233 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007234 if (pusedDefaultChar && *pusedDefaultChar) {
7235 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007236 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007237 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007238
Victor Stinner3a50e702011-10-18 21:21:00 +02007239 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007240 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007241 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007242 if (*outbytes == NULL) {
7243 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007244 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007245 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007246 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007247 }
7248 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007249 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007250 const Py_ssize_t n = PyBytes_Size(*outbytes);
7251 if (outsize > PY_SSIZE_T_MAX - n) {
7252 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007253 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007254 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007255 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007256 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7257 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007258 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007259 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007260 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007261 }
7262
7263 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007264 outsize = WideCharToMultiByte(code_page, flags,
7265 p, size,
7266 out, outsize,
7267 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007268 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007269 if (outsize <= 0)
7270 goto error;
7271 if (pusedDefaultChar && *pusedDefaultChar)
7272 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007273 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007274
Victor Stinner3a50e702011-10-18 21:21:00 +02007275error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007276 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007277 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7278 return -2;
7279 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007280 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007281}
7282
Victor Stinner3a50e702011-10-18 21:21:00 +02007283/*
7284 * Encode a Unicode string to a Windows code page into a byte string using a
7285 * error handler.
7286 *
7287 * Returns consumed characters if succeed, or raise a WindowsError and returns
7288 * -1 on other error.
7289 */
7290static int
7291encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007292 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007293 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007294{
Victor Stinner3a50e702011-10-18 21:21:00 +02007295 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007296 Py_ssize_t pos = unicode_offset;
7297 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007298 /* Ideally, we should get reason from FormatMessage. This is the Windows
7299 2000 English version of the message. */
7300 const char *reason = "invalid character";
7301 /* 4=maximum length of a UTF-8 sequence */
7302 char buffer[4];
7303 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7304 Py_ssize_t outsize;
7305 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007306 PyObject *errorHandler = NULL;
7307 PyObject *exc = NULL;
7308 PyObject *encoding_obj = NULL;
7309 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007310 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007311 PyObject *rep;
7312 int ret = -1;
7313
7314 assert(insize > 0);
7315
7316 encoding = code_page_name(code_page, &encoding_obj);
7317 if (encoding == NULL)
7318 return -1;
7319
7320 if (errors == NULL || strcmp(errors, "strict") == 0) {
7321 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7322 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007323 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007324 if (exc != NULL) {
7325 PyCodec_StrictErrors(exc);
7326 Py_DECREF(exc);
7327 }
7328 Py_XDECREF(encoding_obj);
7329 return -1;
7330 }
7331
7332 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7333 pusedDefaultChar = &usedDefaultChar;
7334 else
7335 pusedDefaultChar = NULL;
7336
7337 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7338 PyErr_NoMemory();
7339 goto error;
7340 }
7341 outsize = insize * Py_ARRAY_LENGTH(buffer);
7342
7343 if (*outbytes == NULL) {
7344 /* Create string object */
7345 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7346 if (*outbytes == NULL)
7347 goto error;
7348 out = PyBytes_AS_STRING(*outbytes);
7349 }
7350 else {
7351 /* Extend string object */
7352 Py_ssize_t n = PyBytes_Size(*outbytes);
7353 if (n > PY_SSIZE_T_MAX - outsize) {
7354 PyErr_NoMemory();
7355 goto error;
7356 }
7357 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7358 goto error;
7359 out = PyBytes_AS_STRING(*outbytes) + n;
7360 }
7361
7362 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007363 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007364 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007365 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7366 wchar_t chars[2];
7367 int charsize;
7368 if (ch < 0x10000) {
7369 chars[0] = (wchar_t)ch;
7370 charsize = 1;
7371 }
7372 else {
7373 ch -= 0x10000;
7374 chars[0] = 0xd800 + (ch >> 10);
7375 chars[1] = 0xdc00 + (ch & 0x3ff);
7376 charsize = 2;
7377 }
7378
Victor Stinner3a50e702011-10-18 21:21:00 +02007379 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007380 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007381 buffer, Py_ARRAY_LENGTH(buffer),
7382 NULL, pusedDefaultChar);
7383 if (outsize > 0) {
7384 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7385 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007386 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007387 memcpy(out, buffer, outsize);
7388 out += outsize;
7389 continue;
7390 }
7391 }
7392 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7393 PyErr_SetFromWindowsErr(0);
7394 goto error;
7395 }
7396
Victor Stinner3a50e702011-10-18 21:21:00 +02007397 rep = unicode_encode_call_errorhandler(
7398 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007399 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007400 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007401 if (rep == NULL)
7402 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007403 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007404
7405 if (PyBytes_Check(rep)) {
7406 outsize = PyBytes_GET_SIZE(rep);
7407 if (outsize != 1) {
7408 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7409 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7410 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7411 Py_DECREF(rep);
7412 goto error;
7413 }
7414 out = PyBytes_AS_STRING(*outbytes) + offset;
7415 }
7416 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7417 out += outsize;
7418 }
7419 else {
7420 Py_ssize_t i;
7421 enum PyUnicode_Kind kind;
7422 void *data;
7423
7424 if (PyUnicode_READY(rep) < 0) {
7425 Py_DECREF(rep);
7426 goto error;
7427 }
7428
7429 outsize = PyUnicode_GET_LENGTH(rep);
7430 if (outsize != 1) {
7431 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7432 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7433 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7434 Py_DECREF(rep);
7435 goto error;
7436 }
7437 out = PyBytes_AS_STRING(*outbytes) + offset;
7438 }
7439 kind = PyUnicode_KIND(rep);
7440 data = PyUnicode_DATA(rep);
7441 for (i=0; i < outsize; i++) {
7442 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7443 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007444 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007445 encoding, unicode,
7446 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007447 "unable to encode error handler result to ASCII");
7448 Py_DECREF(rep);
7449 goto error;
7450 }
7451 *out = (unsigned char)ch;
7452 out++;
7453 }
7454 }
7455 Py_DECREF(rep);
7456 }
7457 /* write a NUL byte */
7458 *out = 0;
7459 outsize = out - PyBytes_AS_STRING(*outbytes);
7460 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7461 if (_PyBytes_Resize(outbytes, outsize) < 0)
7462 goto error;
7463 ret = 0;
7464
7465error:
7466 Py_XDECREF(encoding_obj);
7467 Py_XDECREF(errorHandler);
7468 Py_XDECREF(exc);
7469 return ret;
7470}
7471
Victor Stinner3a50e702011-10-18 21:21:00 +02007472static PyObject *
7473encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007474 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007475 const char *errors)
7476{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007477 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007478 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007479 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007480 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007481
Victor Stinner2fc507f2011-11-04 20:06:39 +01007482 if (PyUnicode_READY(unicode) < 0)
7483 return NULL;
7484 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007485
Victor Stinner3a50e702011-10-18 21:21:00 +02007486 if (code_page < 0) {
7487 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7488 return NULL;
7489 }
7490
Martin v. Löwis3d325192011-11-04 18:23:06 +01007491 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007492 return PyBytes_FromStringAndSize(NULL, 0);
7493
Victor Stinner7581cef2011-11-03 22:32:33 +01007494 offset = 0;
7495 do
7496 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007497#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007498 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007499 chunks. */
7500 if (len > INT_MAX/2) {
7501 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007502 done = 0;
7503 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007504 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007505#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007506 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007507 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007508 done = 1;
7509 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007510
Victor Stinner76a31a62011-11-04 00:05:13 +01007511 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007512 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007513 errors);
7514 if (ret == -2)
7515 ret = encode_code_page_errors(code_page, &outbytes,
7516 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007517 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007518 if (ret < 0) {
7519 Py_XDECREF(outbytes);
7520 return NULL;
7521 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007522
Victor Stinner7581cef2011-11-03 22:32:33 +01007523 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007524 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007525 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007526
Victor Stinner3a50e702011-10-18 21:21:00 +02007527 return outbytes;
7528}
7529
7530PyObject *
7531PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7532 Py_ssize_t size,
7533 const char *errors)
7534{
Victor Stinner7581cef2011-11-03 22:32:33 +01007535 PyObject *unicode, *res;
7536 unicode = PyUnicode_FromUnicode(p, size);
7537 if (unicode == NULL)
7538 return NULL;
7539 res = encode_code_page(CP_ACP, unicode, errors);
7540 Py_DECREF(unicode);
7541 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007542}
7543
7544PyObject *
7545PyUnicode_EncodeCodePage(int code_page,
7546 PyObject *unicode,
7547 const char *errors)
7548{
Victor Stinner7581cef2011-11-03 22:32:33 +01007549 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007550}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007551
Alexander Belopolsky40018472011-02-26 01:02:56 +00007552PyObject *
7553PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007554{
7555 if (!PyUnicode_Check(unicode)) {
7556 PyErr_BadArgument();
7557 return NULL;
7558 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007559 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007560}
7561
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007562#undef NEED_RETRY
7563
Victor Stinner99b95382011-07-04 14:23:54 +02007564#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007565
Guido van Rossumd57fd912000-03-10 22:53:23 +00007566/* --- Character Mapping Codec -------------------------------------------- */
7567
Alexander Belopolsky40018472011-02-26 01:02:56 +00007568PyObject *
7569PyUnicode_DecodeCharmap(const char *s,
7570 Py_ssize_t size,
7571 PyObject *mapping,
7572 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007573{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007574 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007575 Py_ssize_t startinpos;
7576 Py_ssize_t endinpos;
7577 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007578 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007579 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007580 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007581 PyObject *errorHandler = NULL;
7582 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007583
Guido van Rossumd57fd912000-03-10 22:53:23 +00007584 /* Default to Latin-1 */
7585 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007586 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007587
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007588 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007589 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007590 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007591 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007592 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007593 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007594 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007595 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007596 Py_ssize_t maplen;
7597 enum PyUnicode_Kind kind;
7598 void *data;
7599 Py_UCS4 x;
7600
7601 if (PyUnicode_READY(mapping) < 0)
7602 return NULL;
7603
7604 maplen = PyUnicode_GET_LENGTH(mapping);
7605 data = PyUnicode_DATA(mapping);
7606 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007607 while (s < e) {
7608 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007609
Benjamin Peterson29060642009-01-31 22:14:21 +00007610 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007611 x = PyUnicode_READ(kind, data, ch);
7612 else
7613 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007614
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007615 if (x == 0xfffe)
7616 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007617 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007618 startinpos = s-starts;
7619 endinpos = startinpos+1;
7620 if (unicode_decode_call_errorhandler(
7621 errors, &errorHandler,
7622 "charmap", "character maps to <undefined>",
7623 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007624 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007625 goto onError;
7626 }
7627 continue;
7628 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007629
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007630 if (unicode_putchar(&v, &outpos, x) < 0)
7631 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007632 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007633 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007634 }
7635 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007636 while (s < e) {
7637 unsigned char ch = *s;
7638 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007639
Benjamin Peterson29060642009-01-31 22:14:21 +00007640 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7641 w = PyLong_FromLong((long)ch);
7642 if (w == NULL)
7643 goto onError;
7644 x = PyObject_GetItem(mapping, w);
7645 Py_DECREF(w);
7646 if (x == NULL) {
7647 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7648 /* No mapping found means: mapping is undefined. */
7649 PyErr_Clear();
7650 x = Py_None;
7651 Py_INCREF(x);
7652 } else
7653 goto onError;
7654 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007655
Benjamin Peterson29060642009-01-31 22:14:21 +00007656 /* Apply mapping */
7657 if (PyLong_Check(x)) {
7658 long value = PyLong_AS_LONG(x);
7659 if (value < 0 || value > 65535) {
7660 PyErr_SetString(PyExc_TypeError,
7661 "character mapping must be in range(65536)");
7662 Py_DECREF(x);
7663 goto onError;
7664 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007665 if (unicode_putchar(&v, &outpos, value) < 0)
7666 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007667 }
7668 else if (x == Py_None) {
7669 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007670 startinpos = s-starts;
7671 endinpos = startinpos+1;
7672 if (unicode_decode_call_errorhandler(
7673 errors, &errorHandler,
7674 "charmap", "character maps to <undefined>",
7675 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007676 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007677 Py_DECREF(x);
7678 goto onError;
7679 }
7680 Py_DECREF(x);
7681 continue;
7682 }
7683 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007684 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007685
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007686 if (PyUnicode_READY(x) < 0)
7687 goto onError;
7688 targetsize = PyUnicode_GET_LENGTH(x);
7689
7690 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007691 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007692 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007693 PyUnicode_READ_CHAR(x, 0)) < 0)
7694 goto onError;
7695 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007696 else if (targetsize > 1) {
7697 /* 1-n mapping */
7698 if (targetsize > extrachars) {
7699 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007700 Py_ssize_t needed = (targetsize - extrachars) + \
7701 (targetsize << 2);
7702 extrachars += needed;
7703 /* XXX overflow detection missing */
Victor Stinner7931d9a2011-11-04 00:22:48 +01007704 if (PyUnicode_Resize(&v,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007705 PyUnicode_GET_LENGTH(v) + needed) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007706 Py_DECREF(x);
7707 goto onError;
7708 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007709 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007710 if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7711 goto onError;
7712 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7713 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007714 extrachars -= targetsize;
7715 }
7716 /* 1-0 mapping: skip the character */
7717 }
7718 else {
7719 /* wrong return value */
7720 PyErr_SetString(PyExc_TypeError,
7721 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007722 Py_DECREF(x);
7723 goto onError;
7724 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007725 Py_DECREF(x);
7726 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007727 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007728 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007729 if (PyUnicode_Resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007730 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007731 Py_XDECREF(errorHandler);
7732 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007733 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007734
Benjamin Peterson29060642009-01-31 22:14:21 +00007735 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007736 Py_XDECREF(errorHandler);
7737 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007738 Py_XDECREF(v);
7739 return NULL;
7740}
7741
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007742/* Charmap encoding: the lookup table */
7743
Alexander Belopolsky40018472011-02-26 01:02:56 +00007744struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007745 PyObject_HEAD
7746 unsigned char level1[32];
7747 int count2, count3;
7748 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007749};
7750
7751static PyObject*
7752encoding_map_size(PyObject *obj, PyObject* args)
7753{
7754 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007755 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007756 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007757}
7758
7759static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007760 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007761 PyDoc_STR("Return the size (in bytes) of this object") },
7762 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007763};
7764
7765static void
7766encoding_map_dealloc(PyObject* o)
7767{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007768 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007769}
7770
7771static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007772 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007773 "EncodingMap", /*tp_name*/
7774 sizeof(struct encoding_map), /*tp_basicsize*/
7775 0, /*tp_itemsize*/
7776 /* methods */
7777 encoding_map_dealloc, /*tp_dealloc*/
7778 0, /*tp_print*/
7779 0, /*tp_getattr*/
7780 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007781 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007782 0, /*tp_repr*/
7783 0, /*tp_as_number*/
7784 0, /*tp_as_sequence*/
7785 0, /*tp_as_mapping*/
7786 0, /*tp_hash*/
7787 0, /*tp_call*/
7788 0, /*tp_str*/
7789 0, /*tp_getattro*/
7790 0, /*tp_setattro*/
7791 0, /*tp_as_buffer*/
7792 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7793 0, /*tp_doc*/
7794 0, /*tp_traverse*/
7795 0, /*tp_clear*/
7796 0, /*tp_richcompare*/
7797 0, /*tp_weaklistoffset*/
7798 0, /*tp_iter*/
7799 0, /*tp_iternext*/
7800 encoding_map_methods, /*tp_methods*/
7801 0, /*tp_members*/
7802 0, /*tp_getset*/
7803 0, /*tp_base*/
7804 0, /*tp_dict*/
7805 0, /*tp_descr_get*/
7806 0, /*tp_descr_set*/
7807 0, /*tp_dictoffset*/
7808 0, /*tp_init*/
7809 0, /*tp_alloc*/
7810 0, /*tp_new*/
7811 0, /*tp_free*/
7812 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007813};
7814
7815PyObject*
7816PyUnicode_BuildEncodingMap(PyObject* string)
7817{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007818 PyObject *result;
7819 struct encoding_map *mresult;
7820 int i;
7821 int need_dict = 0;
7822 unsigned char level1[32];
7823 unsigned char level2[512];
7824 unsigned char *mlevel1, *mlevel2, *mlevel3;
7825 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007826 int kind;
7827 void *data;
7828 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007829
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007830 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007831 PyErr_BadArgument();
7832 return NULL;
7833 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007834 kind = PyUnicode_KIND(string);
7835 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007836 memset(level1, 0xFF, sizeof level1);
7837 memset(level2, 0xFF, sizeof level2);
7838
7839 /* If there isn't a one-to-one mapping of NULL to \0,
7840 or if there are non-BMP characters, we need to use
7841 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007842 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007843 need_dict = 1;
7844 for (i = 1; i < 256; i++) {
7845 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007846 ch = PyUnicode_READ(kind, data, i);
7847 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007848 need_dict = 1;
7849 break;
7850 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007851 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007852 /* unmapped character */
7853 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007854 l1 = ch >> 11;
7855 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007856 if (level1[l1] == 0xFF)
7857 level1[l1] = count2++;
7858 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007859 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007860 }
7861
7862 if (count2 >= 0xFF || count3 >= 0xFF)
7863 need_dict = 1;
7864
7865 if (need_dict) {
7866 PyObject *result = PyDict_New();
7867 PyObject *key, *value;
7868 if (!result)
7869 return NULL;
7870 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007871 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007872 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007873 if (!key || !value)
7874 goto failed1;
7875 if (PyDict_SetItem(result, key, value) == -1)
7876 goto failed1;
7877 Py_DECREF(key);
7878 Py_DECREF(value);
7879 }
7880 return result;
7881 failed1:
7882 Py_XDECREF(key);
7883 Py_XDECREF(value);
7884 Py_DECREF(result);
7885 return NULL;
7886 }
7887
7888 /* Create a three-level trie */
7889 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7890 16*count2 + 128*count3 - 1);
7891 if (!result)
7892 return PyErr_NoMemory();
7893 PyObject_Init(result, &EncodingMapType);
7894 mresult = (struct encoding_map*)result;
7895 mresult->count2 = count2;
7896 mresult->count3 = count3;
7897 mlevel1 = mresult->level1;
7898 mlevel2 = mresult->level23;
7899 mlevel3 = mresult->level23 + 16*count2;
7900 memcpy(mlevel1, level1, 32);
7901 memset(mlevel2, 0xFF, 16*count2);
7902 memset(mlevel3, 0, 128*count3);
7903 count3 = 0;
7904 for (i = 1; i < 256; i++) {
7905 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007906 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007907 /* unmapped character */
7908 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007909 o1 = PyUnicode_READ(kind, data, i)>>11;
7910 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007911 i2 = 16*mlevel1[o1] + o2;
7912 if (mlevel2[i2] == 0xFF)
7913 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007914 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007915 i3 = 128*mlevel2[i2] + o3;
7916 mlevel3[i3] = i;
7917 }
7918 return result;
7919}
7920
7921static int
Victor Stinner22168992011-11-20 17:09:18 +01007922encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007923{
7924 struct encoding_map *map = (struct encoding_map*)mapping;
7925 int l1 = c>>11;
7926 int l2 = (c>>7) & 0xF;
7927 int l3 = c & 0x7F;
7928 int i;
7929
Victor Stinner22168992011-11-20 17:09:18 +01007930 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007931 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007932 if (c == 0)
7933 return 0;
7934 /* level 1*/
7935 i = map->level1[l1];
7936 if (i == 0xFF) {
7937 return -1;
7938 }
7939 /* level 2*/
7940 i = map->level23[16*i+l2];
7941 if (i == 0xFF) {
7942 return -1;
7943 }
7944 /* level 3 */
7945 i = map->level23[16*map->count2 + 128*i + l3];
7946 if (i == 0) {
7947 return -1;
7948 }
7949 return i;
7950}
7951
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007952/* Lookup the character ch in the mapping. If the character
7953 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007954 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007955static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007956charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007957{
Christian Heimes217cfd12007-12-02 14:31:20 +00007958 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007959 PyObject *x;
7960
7961 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007962 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007963 x = PyObject_GetItem(mapping, w);
7964 Py_DECREF(w);
7965 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007966 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7967 /* No mapping found means: mapping is undefined. */
7968 PyErr_Clear();
7969 x = Py_None;
7970 Py_INCREF(x);
7971 return x;
7972 } else
7973 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007974 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007975 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007976 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007977 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007978 long value = PyLong_AS_LONG(x);
7979 if (value < 0 || value > 255) {
7980 PyErr_SetString(PyExc_TypeError,
7981 "character mapping must be in range(256)");
7982 Py_DECREF(x);
7983 return NULL;
7984 }
7985 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007986 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007987 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007988 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007989 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007990 /* wrong return value */
7991 PyErr_Format(PyExc_TypeError,
7992 "character mapping must return integer, bytes or None, not %.400s",
7993 x->ob_type->tp_name);
7994 Py_DECREF(x);
7995 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007996 }
7997}
7998
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007999static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008000charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008001{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008002 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8003 /* exponentially overallocate to minimize reallocations */
8004 if (requiredsize < 2*outsize)
8005 requiredsize = 2*outsize;
8006 if (_PyBytes_Resize(outobj, requiredsize))
8007 return -1;
8008 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008009}
8010
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no encoding for the character */
    enc_EXCEPTION   /* an error occurred while encoding */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008014/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008015 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008016 space is available. Return a new reference to the object that
8017 was put in the output buffer, or Py_None, if the mapping was undefined
8018 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008019 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008020static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008021charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008022 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008023{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008024 PyObject *rep;
8025 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008026 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008027
Christian Heimes90aa7642007-12-19 02:45:37 +00008028 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008029 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008030 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008031 if (res == -1)
8032 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008033 if (outsize<requiredsize)
8034 if (charmapencode_resize(outobj, outpos, requiredsize))
8035 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008036 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008037 outstart[(*outpos)++] = (char)res;
8038 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008039 }
8040
8041 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008042 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008043 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008044 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008045 Py_DECREF(rep);
8046 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008047 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008048 if (PyLong_Check(rep)) {
8049 Py_ssize_t requiredsize = *outpos+1;
8050 if (outsize<requiredsize)
8051 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8052 Py_DECREF(rep);
8053 return enc_EXCEPTION;
8054 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008055 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008056 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008057 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008058 else {
8059 const char *repchars = PyBytes_AS_STRING(rep);
8060 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8061 Py_ssize_t requiredsize = *outpos+repsize;
8062 if (outsize<requiredsize)
8063 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8064 Py_DECREF(rep);
8065 return enc_EXCEPTION;
8066 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008067 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008068 memcpy(outstart + *outpos, repchars, repsize);
8069 *outpos += repsize;
8070 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008071 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008072 Py_DECREF(rep);
8073 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008074}
8075
8076/* handle an error in PyUnicode_EncodeCharmap
8077 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008078static int
8079charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008080 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008081 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008082 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008083 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008084{
8085 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008086 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008087 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008088 enum PyUnicode_Kind kind;
8089 void *data;
8090 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008091 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008092 Py_ssize_t collstartpos = *inpos;
8093 Py_ssize_t collendpos = *inpos+1;
8094 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008095 char *encoding = "charmap";
8096 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008097 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008098 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008099 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008100
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008101 if (PyUnicode_READY(unicode) < 0)
8102 return -1;
8103 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008104 /* find all unencodable characters */
8105 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008106 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008107 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008108 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008109 val = encoding_map_lookup(ch, mapping);
8110 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008111 break;
8112 ++collendpos;
8113 continue;
8114 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008115
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008116 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8117 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008118 if (rep==NULL)
8119 return -1;
8120 else if (rep!=Py_None) {
8121 Py_DECREF(rep);
8122 break;
8123 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008124 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008125 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008126 }
8127 /* cache callback name lookup
8128 * (if not done yet, i.e. it's the first error) */
8129 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008130 if ((errors==NULL) || (!strcmp(errors, "strict")))
8131 *known_errorHandler = 1;
8132 else if (!strcmp(errors, "replace"))
8133 *known_errorHandler = 2;
8134 else if (!strcmp(errors, "ignore"))
8135 *known_errorHandler = 3;
8136 else if (!strcmp(errors, "xmlcharrefreplace"))
8137 *known_errorHandler = 4;
8138 else
8139 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008140 }
8141 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008142 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008143 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008144 return -1;
8145 case 2: /* replace */
8146 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008147 x = charmapencode_output('?', mapping, res, respos);
8148 if (x==enc_EXCEPTION) {
8149 return -1;
8150 }
8151 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008152 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008153 return -1;
8154 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008155 }
8156 /* fall through */
8157 case 3: /* ignore */
8158 *inpos = collendpos;
8159 break;
8160 case 4: /* xmlcharrefreplace */
8161 /* generate replacement (temporarily (mis)uses p) */
8162 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008163 char buffer[2+29+1+1];
8164 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008165 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008166 for (cp = buffer; *cp; ++cp) {
8167 x = charmapencode_output(*cp, mapping, res, respos);
8168 if (x==enc_EXCEPTION)
8169 return -1;
8170 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008171 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008172 return -1;
8173 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008174 }
8175 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008176 *inpos = collendpos;
8177 break;
8178 default:
8179 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008180 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008181 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008182 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008183 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008184 if (PyBytes_Check(repunicode)) {
8185 /* Directly copy bytes result to output. */
8186 Py_ssize_t outsize = PyBytes_Size(*res);
8187 Py_ssize_t requiredsize;
8188 repsize = PyBytes_Size(repunicode);
8189 requiredsize = *respos + repsize;
8190 if (requiredsize > outsize)
8191 /* Make room for all additional bytes. */
8192 if (charmapencode_resize(res, respos, requiredsize)) {
8193 Py_DECREF(repunicode);
8194 return -1;
8195 }
8196 memcpy(PyBytes_AsString(*res) + *respos,
8197 PyBytes_AsString(repunicode), repsize);
8198 *respos += repsize;
8199 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008200 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008201 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008202 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008203 /* generate replacement */
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008204 if (PyUnicode_READY(repunicode) < 0) {
8205 Py_DECREF(repunicode);
8206 return -1;
8207 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008208 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008209 data = PyUnicode_DATA(repunicode);
8210 kind = PyUnicode_KIND(repunicode);
8211 for (index = 0; index < repsize; index++) {
8212 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8213 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008214 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008215 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008216 return -1;
8217 }
8218 else if (x==enc_FAILED) {
8219 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008220 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008221 return -1;
8222 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008223 }
8224 *inpos = newpos;
8225 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008226 }
8227 return 0;
8228}
8229
Alexander Belopolsky40018472011-02-26 01:02:56 +00008230PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008231_PyUnicode_EncodeCharmap(PyObject *unicode,
8232 PyObject *mapping,
8233 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008234{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008235 /* output object */
8236 PyObject *res = NULL;
8237 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008238 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008239 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008240 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008241 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008242 PyObject *errorHandler = NULL;
8243 PyObject *exc = NULL;
8244 /* the following variable is used for caching string comparisons
8245 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8246 * 3=ignore, 4=xmlcharrefreplace */
8247 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008248
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008249 if (PyUnicode_READY(unicode) < 0)
8250 return NULL;
8251 size = PyUnicode_GET_LENGTH(unicode);
8252
Guido van Rossumd57fd912000-03-10 22:53:23 +00008253 /* Default to Latin-1 */
8254 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008255 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008256
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008257 /* allocate enough for a simple encoding without
8258 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008259 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008260 if (res == NULL)
8261 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008262 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008263 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008264
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008265 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008266 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008267 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008268 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008269 if (x==enc_EXCEPTION) /* error */
8270 goto onError;
8271 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008272 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008273 &exc,
8274 &known_errorHandler, &errorHandler, errors,
8275 &res, &respos)) {
8276 goto onError;
8277 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008278 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008279 else
8280 /* done with this character => adjust input position */
8281 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008282 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008283
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008284 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008285 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008286 if (_PyBytes_Resize(&res, respos) < 0)
8287 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008288
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008289 Py_XDECREF(exc);
8290 Py_XDECREF(errorHandler);
8291 return res;
8292
Benjamin Peterson29060642009-01-31 22:14:21 +00008293 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008294 Py_XDECREF(res);
8295 Py_XDECREF(exc);
8296 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008297 return NULL;
8298}
8299
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008300/* Deprecated */
8301PyObject *
8302PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8303 Py_ssize_t size,
8304 PyObject *mapping,
8305 const char *errors)
8306{
8307 PyObject *result;
8308 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8309 if (unicode == NULL)
8310 return NULL;
8311 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8312 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008313 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008314}
8315
Alexander Belopolsky40018472011-02-26 01:02:56 +00008316PyObject *
8317PyUnicode_AsCharmapString(PyObject *unicode,
8318 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008319{
8320 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008321 PyErr_BadArgument();
8322 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008323 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008324 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008325}
8326
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008327/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008328static void
8329make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008330 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008331 Py_ssize_t startpos, Py_ssize_t endpos,
8332 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008333{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008334 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008335 *exceptionObject = _PyUnicodeTranslateError_Create(
8336 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008337 }
8338 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008339 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8340 goto onError;
8341 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8342 goto onError;
8343 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8344 goto onError;
8345 return;
8346 onError:
8347 Py_DECREF(*exceptionObject);
8348 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008349 }
8350}
8351
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008352/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008353static void
8354raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008355 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008356 Py_ssize_t startpos, Py_ssize_t endpos,
8357 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008358{
8359 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008360 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008361 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008362 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008363}
8364
8365/* error handling callback helper:
8366 build arguments, call the callback and check the arguments,
8367 put the result into newpos and return the replacement string, which
8368 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008369static PyObject *
8370unicode_translate_call_errorhandler(const char *errors,
8371 PyObject **errorHandler,
8372 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008373 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008374 Py_ssize_t startpos, Py_ssize_t endpos,
8375 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008376{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008377 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008378
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008379 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008380 PyObject *restuple;
8381 PyObject *resunicode;
8382
8383 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008384 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008385 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008386 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008387 }
8388
8389 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008390 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008391 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008392 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008393
8394 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008395 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008396 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008397 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008398 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008399 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008400 Py_DECREF(restuple);
8401 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008402 }
8403 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008404 &resunicode, &i_newpos)) {
8405 Py_DECREF(restuple);
8406 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008407 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008408 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008409 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008410 else
8411 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008412 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008413 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8414 Py_DECREF(restuple);
8415 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008416 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008417 Py_INCREF(resunicode);
8418 Py_DECREF(restuple);
8419 return resunicode;
8420}
8421
8422/* Lookup the character ch in the mapping and put the result in result,
8423 which must be decrefed by the caller.
8424 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008425static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008426charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008427{
Christian Heimes217cfd12007-12-02 14:31:20 +00008428 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008429 PyObject *x;
8430
8431 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008432 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008433 x = PyObject_GetItem(mapping, w);
8434 Py_DECREF(w);
8435 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008436 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8437 /* No mapping found means: use 1:1 mapping. */
8438 PyErr_Clear();
8439 *result = NULL;
8440 return 0;
8441 } else
8442 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008443 }
8444 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008445 *result = x;
8446 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008447 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008448 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008449 long value = PyLong_AS_LONG(x);
8450 long max = PyUnicode_GetMax();
8451 if (value < 0 || value > max) {
8452 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008453 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008454 Py_DECREF(x);
8455 return -1;
8456 }
8457 *result = x;
8458 return 0;
8459 }
8460 else if (PyUnicode_Check(x)) {
8461 *result = x;
8462 return 0;
8463 }
8464 else {
8465 /* wrong return value */
8466 PyErr_SetString(PyExc_TypeError,
8467 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008468 Py_DECREF(x);
8469 return -1;
8470 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008471}
8472/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008473 if not reallocate and adjust various state variables.
8474 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008475static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008476charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008477 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008478{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008479 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008480 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008481 /* exponentially overallocate to minimize reallocations */
8482 if (requiredsize < 2 * oldsize)
8483 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008484 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8485 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008486 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008487 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008488 }
8489 return 0;
8490}
8491/* lookup the character, put the result in the output string and adjust
8492 various state variables. Return a new reference to the object that
8493 was put in the output buffer in *result, or Py_None, if the mapping was
8494 undefined (in which case no character was written).
8495 The called must decref result.
8496 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008497static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008498charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8499 PyObject *mapping, Py_UCS4 **output,
8500 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008501 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008502{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008503 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8504 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008505 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008506 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008507 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008508 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008509 }
8510 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008511 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008512 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008513 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008514 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008515 }
8516 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008517 Py_ssize_t repsize;
8518 if (PyUnicode_READY(*res) == -1)
8519 return -1;
8520 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008521 if (repsize==1) {
8522 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008523 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008524 }
8525 else if (repsize!=0) {
8526 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008527 Py_ssize_t requiredsize = *opos +
8528 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008529 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008530 Py_ssize_t i;
8531 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008532 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008533 for(i = 0; i < repsize; i++)
8534 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008535 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008536 }
8537 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008538 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008539 return 0;
8540}
8541
Alexander Belopolsky40018472011-02-26 01:02:56 +00008542PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008543_PyUnicode_TranslateCharmap(PyObject *input,
8544 PyObject *mapping,
8545 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008546{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008547 /* input object */
8548 char *idata;
8549 Py_ssize_t size, i;
8550 int kind;
8551 /* output buffer */
8552 Py_UCS4 *output = NULL;
8553 Py_ssize_t osize;
8554 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008555 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008556 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008557 char *reason = "character maps to <undefined>";
8558 PyObject *errorHandler = NULL;
8559 PyObject *exc = NULL;
8560 /* the following variable is used for caching string comparisons
8561 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8562 * 3=ignore, 4=xmlcharrefreplace */
8563 int known_errorHandler = -1;
8564
Guido van Rossumd57fd912000-03-10 22:53:23 +00008565 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008566 PyErr_BadArgument();
8567 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008568 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008569
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008570 if (PyUnicode_READY(input) == -1)
8571 return NULL;
8572 idata = (char*)PyUnicode_DATA(input);
8573 kind = PyUnicode_KIND(input);
8574 size = PyUnicode_GET_LENGTH(input);
8575 i = 0;
8576
8577 if (size == 0) {
8578 Py_INCREF(input);
8579 return input;
8580 }
8581
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008582 /* allocate enough for a simple 1:1 translation without
8583 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008584 osize = size;
8585 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8586 opos = 0;
8587 if (output == NULL) {
8588 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008589 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008590 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008591
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008592 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008593 /* try to encode it */
8594 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008595 if (charmaptranslate_output(input, i, mapping,
8596 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008597 Py_XDECREF(x);
8598 goto onError;
8599 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008600 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008601 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008602 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008603 else { /* untranslatable character */
8604 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8605 Py_ssize_t repsize;
8606 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008607 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008608 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008609 Py_ssize_t collstart = i;
8610 Py_ssize_t collend = i+1;
8611 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008612
Benjamin Peterson29060642009-01-31 22:14:21 +00008613 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008614 while (collend < size) {
8615 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008616 goto onError;
8617 Py_XDECREF(x);
8618 if (x!=Py_None)
8619 break;
8620 ++collend;
8621 }
8622 /* cache callback name lookup
8623 * (if not done yet, i.e. it's the first error) */
8624 if (known_errorHandler==-1) {
8625 if ((errors==NULL) || (!strcmp(errors, "strict")))
8626 known_errorHandler = 1;
8627 else if (!strcmp(errors, "replace"))
8628 known_errorHandler = 2;
8629 else if (!strcmp(errors, "ignore"))
8630 known_errorHandler = 3;
8631 else if (!strcmp(errors, "xmlcharrefreplace"))
8632 known_errorHandler = 4;
8633 else
8634 known_errorHandler = 0;
8635 }
8636 switch (known_errorHandler) {
8637 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008638 raise_translate_exception(&exc, input, collstart,
8639 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008640 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008641 case 2: /* replace */
8642 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008643 for (coll = collstart; coll<collend; coll++)
8644 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008645 /* fall through */
8646 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008647 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008648 break;
8649 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008650 /* generate replacement (temporarily (mis)uses i) */
8651 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008652 char buffer[2+29+1+1];
8653 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008654 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8655 if (charmaptranslate_makespace(&output, &osize,
8656 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008657 goto onError;
8658 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008659 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008660 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008661 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008662 break;
8663 default:
8664 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008665 reason, input, &exc,
8666 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008667 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008668 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008669 if (PyUnicode_READY(repunicode) < 0) {
8670 Py_DECREF(repunicode);
8671 goto onError;
8672 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008673 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008674 repsize = PyUnicode_GET_LENGTH(repunicode);
8675 if (charmaptranslate_makespace(&output, &osize,
8676 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008677 Py_DECREF(repunicode);
8678 goto onError;
8679 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008680 for (uni2 = 0; repsize-->0; ++uni2)
8681 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8682 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008683 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008684 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008685 }
8686 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008687 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8688 if (!res)
8689 goto onError;
8690 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008691 Py_XDECREF(exc);
8692 Py_XDECREF(errorHandler);
8693 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008694
Benjamin Peterson29060642009-01-31 22:14:21 +00008695 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008696 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008697 Py_XDECREF(exc);
8698 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008699 return NULL;
8700}
8701
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008702/* Deprecated. Use PyUnicode_Translate instead. */
8703PyObject *
8704PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8705 Py_ssize_t size,
8706 PyObject *mapping,
8707 const char *errors)
8708{
8709 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8710 if (!unicode)
8711 return NULL;
8712 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8713}
8714
Alexander Belopolsky40018472011-02-26 01:02:56 +00008715PyObject *
8716PyUnicode_Translate(PyObject *str,
8717 PyObject *mapping,
8718 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008719{
8720 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008721
Guido van Rossumd57fd912000-03-10 22:53:23 +00008722 str = PyUnicode_FromObject(str);
8723 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008724 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008725 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008726 Py_DECREF(str);
8727 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008728
Benjamin Peterson29060642009-01-31 22:14:21 +00008729 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008730 Py_XDECREF(str);
8731 return NULL;
8732}
Tim Petersced69f82003-09-16 20:30:58 +00008733
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008734static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008735fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008736{
8737 /* No need to call PyUnicode_READY(self) because this function is only
8738 called as a callback from fixup() which does it already. */
8739 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8740 const int kind = PyUnicode_KIND(self);
8741 void *data = PyUnicode_DATA(self);
8742 Py_UCS4 maxchar = 0, ch, fixed;
8743 Py_ssize_t i;
8744
8745 for (i = 0; i < len; ++i) {
8746 ch = PyUnicode_READ(kind, data, i);
8747 fixed = 0;
8748 if (ch > 127) {
8749 if (Py_UNICODE_ISSPACE(ch))
8750 fixed = ' ';
8751 else {
8752 const int decimal = Py_UNICODE_TODECIMAL(ch);
8753 if (decimal >= 0)
8754 fixed = '0' + decimal;
8755 }
8756 if (fixed != 0) {
8757 if (fixed > maxchar)
8758 maxchar = fixed;
8759 PyUnicode_WRITE(kind, data, i, fixed);
8760 }
8761 else if (ch > maxchar)
8762 maxchar = ch;
8763 }
8764 else if (ch > maxchar)
8765 maxchar = ch;
8766 }
8767
8768 return maxchar;
8769}
8770
8771PyObject *
8772_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8773{
8774 if (!PyUnicode_Check(unicode)) {
8775 PyErr_BadInternalCall();
8776 return NULL;
8777 }
8778 if (PyUnicode_READY(unicode) == -1)
8779 return NULL;
8780 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8781 /* If the string is already ASCII, just return the same string */
8782 Py_INCREF(unicode);
8783 return unicode;
8784 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008785 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008786}
8787
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008788PyObject *
8789PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8790 Py_ssize_t length)
8791{
Victor Stinnerf0124502011-11-21 23:12:56 +01008792 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008793 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008794 Py_UCS4 maxchar;
8795 enum PyUnicode_Kind kind;
8796 void *data;
8797
8798 maxchar = 0;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008799 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008800 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008801 if (ch > 127) {
8802 int decimal = Py_UNICODE_TODECIMAL(ch);
8803 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008804 ch = '0' + decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008805 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008806 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008807 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008808
8809 /* Copy to a new string */
8810 decimal = PyUnicode_New(length, maxchar);
8811 if (decimal == NULL)
8812 return decimal;
8813 kind = PyUnicode_KIND(decimal);
8814 data = PyUnicode_DATA(decimal);
8815 /* Iterate over code points */
8816 for (i = 0; i < length; i++) {
8817 Py_UNICODE ch = s[i];
8818 if (ch > 127) {
8819 int decimal = Py_UNICODE_TODECIMAL(ch);
8820 if (decimal >= 0)
8821 ch = '0' + decimal;
8822 }
8823 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008824 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008825 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008826}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008827/* --- Decimal Encoder ---------------------------------------------------- */
8828
Alexander Belopolsky40018472011-02-26 01:02:56 +00008829int
8830PyUnicode_EncodeDecimal(Py_UNICODE *s,
8831 Py_ssize_t length,
8832 char *output,
8833 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008834{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008835 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008836 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008837 enum PyUnicode_Kind kind;
8838 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008839
8840 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008841 PyErr_BadArgument();
8842 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008843 }
8844
Victor Stinner42bf7752011-11-21 22:52:58 +01008845 unicode = PyUnicode_FromUnicode(s, length);
8846 if (unicode == NULL)
8847 return -1;
8848
Victor Stinner6345be92011-11-25 20:09:01 +01008849 if (PyUnicode_READY(unicode) < 0) {
8850 Py_DECREF(unicode);
8851 return -1;
8852 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008853 kind = PyUnicode_KIND(unicode);
8854 data = PyUnicode_DATA(unicode);
8855
Victor Stinnerb84d7232011-11-22 01:50:07 +01008856 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008857 PyObject *exc;
8858 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008859 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008860 Py_ssize_t startpos;
8861
8862 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008863
Benjamin Peterson29060642009-01-31 22:14:21 +00008864 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008865 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008866 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008867 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008868 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008869 decimal = Py_UNICODE_TODECIMAL(ch);
8870 if (decimal >= 0) {
8871 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008872 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008873 continue;
8874 }
8875 if (0 < ch && ch < 256) {
8876 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008877 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008878 continue;
8879 }
Victor Stinner6345be92011-11-25 20:09:01 +01008880
Victor Stinner42bf7752011-11-21 22:52:58 +01008881 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008882 exc = NULL;
8883 raise_encode_exception(&exc, "decimal", unicode,
8884 startpos, startpos+1,
8885 "invalid decimal Unicode string");
8886 Py_XDECREF(exc);
8887 Py_DECREF(unicode);
8888 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008889 }
8890 /* 0-terminate the output string */
8891 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008892 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008893 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008894}
8895
Guido van Rossumd57fd912000-03-10 22:53:23 +00008896/* --- Helpers ------------------------------------------------------------ */
8897
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008898static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008899any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008900 Py_ssize_t start,
8901 Py_ssize_t end)
8902{
8903 int kind1, kind2, kind;
8904 void *buf1, *buf2;
8905 Py_ssize_t len1, len2, result;
8906
8907 kind1 = PyUnicode_KIND(s1);
8908 kind2 = PyUnicode_KIND(s2);
8909 kind = kind1 > kind2 ? kind1 : kind2;
8910 buf1 = PyUnicode_DATA(s1);
8911 buf2 = PyUnicode_DATA(s2);
8912 if (kind1 != kind)
8913 buf1 = _PyUnicode_AsKind(s1, kind);
8914 if (!buf1)
8915 return -2;
8916 if (kind2 != kind)
8917 buf2 = _PyUnicode_AsKind(s2, kind);
8918 if (!buf2) {
8919 if (kind1 != kind) PyMem_Free(buf1);
8920 return -2;
8921 }
8922 len1 = PyUnicode_GET_LENGTH(s1);
8923 len2 = PyUnicode_GET_LENGTH(s2);
8924
Victor Stinner794d5672011-10-10 03:21:36 +02008925 if (direction > 0) {
8926 switch(kind) {
8927 case PyUnicode_1BYTE_KIND:
8928 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8929 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8930 else
8931 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8932 break;
8933 case PyUnicode_2BYTE_KIND:
8934 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8935 break;
8936 case PyUnicode_4BYTE_KIND:
8937 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8938 break;
8939 default:
8940 assert(0); result = -2;
8941 }
8942 }
8943 else {
8944 switch(kind) {
8945 case PyUnicode_1BYTE_KIND:
8946 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8947 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8948 else
8949 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8950 break;
8951 case PyUnicode_2BYTE_KIND:
8952 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8953 break;
8954 case PyUnicode_4BYTE_KIND:
8955 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8956 break;
8957 default:
8958 assert(0); result = -2;
8959 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008960 }
8961
8962 if (kind1 != kind)
8963 PyMem_Free(buf1);
8964 if (kind2 != kind)
8965 PyMem_Free(buf2);
8966
8967 return result;
8968}
8969
8970Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02008971_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008972 Py_ssize_t n_buffer,
8973 void *digits, Py_ssize_t n_digits,
8974 Py_ssize_t min_width,
8975 const char *grouping,
8976 const char *thousands_sep)
8977{
8978 switch(kind) {
8979 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008980 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
8981 return _PyUnicode_ascii_InsertThousandsGrouping(
8982 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8983 min_width, grouping, thousands_sep);
8984 else
8985 return _PyUnicode_ucs1_InsertThousandsGrouping(
8986 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8987 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008988 case PyUnicode_2BYTE_KIND:
8989 return _PyUnicode_ucs2_InsertThousandsGrouping(
8990 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8991 min_width, grouping, thousands_sep);
8992 case PyUnicode_4BYTE_KIND:
8993 return _PyUnicode_ucs4_InsertThousandsGrouping(
8994 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8995 min_width, grouping, thousands_sep);
8996 }
8997 assert(0);
8998 return -1;
8999}
9000
9001
/* Helper macro to fix up start/end slice values against a length `len`:
     - `end` greater than `len` is clipped to `len`;
     - a negative `end` or `start` is taken relative to `len` and clipped
       to 0 if it is still negative.
   Note that `start` is NOT clipped to `len`; callers must cope with
   start > end.

   `start` and `end` must be modifiable lvalues.  Every argument may be
   evaluated more than once, so none of them may have side effects.
   The body is wrapped in do { } while (0) so that the macro behaves like
   a single statement (e.g. under an unbraced if/else); invoke it with a
   trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009016
Alexander Belopolsky40018472011-02-26 01:02:56 +00009017Py_ssize_t
9018PyUnicode_Count(PyObject *str,
9019 PyObject *substr,
9020 Py_ssize_t start,
9021 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009022{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009023 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009024 PyObject* str_obj;
9025 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009026 int kind1, kind2, kind;
9027 void *buf1 = NULL, *buf2 = NULL;
9028 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009029
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009030 str_obj = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009031 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009032 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009033 sub_obj = PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02009034 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009035 Py_DECREF(str_obj);
9036 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009037 }
Tim Petersced69f82003-09-16 20:30:58 +00009038
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009039 kind1 = PyUnicode_KIND(str_obj);
9040 kind2 = PyUnicode_KIND(sub_obj);
9041 kind = kind1 > kind2 ? kind1 : kind2;
9042 buf1 = PyUnicode_DATA(str_obj);
9043 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009044 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009045 if (!buf1)
9046 goto onError;
9047 buf2 = PyUnicode_DATA(sub_obj);
9048 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009049 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009050 if (!buf2)
9051 goto onError;
9052 len1 = PyUnicode_GET_LENGTH(str_obj);
9053 len2 = PyUnicode_GET_LENGTH(sub_obj);
9054
9055 ADJUST_INDICES(start, end, len1);
9056 switch(kind) {
9057 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009058 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9059 result = asciilib_count(
9060 ((Py_UCS1*)buf1) + start, end - start,
9061 buf2, len2, PY_SSIZE_T_MAX
9062 );
9063 else
9064 result = ucs1lib_count(
9065 ((Py_UCS1*)buf1) + start, end - start,
9066 buf2, len2, PY_SSIZE_T_MAX
9067 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009068 break;
9069 case PyUnicode_2BYTE_KIND:
9070 result = ucs2lib_count(
9071 ((Py_UCS2*)buf1) + start, end - start,
9072 buf2, len2, PY_SSIZE_T_MAX
9073 );
9074 break;
9075 case PyUnicode_4BYTE_KIND:
9076 result = ucs4lib_count(
9077 ((Py_UCS4*)buf1) + start, end - start,
9078 buf2, len2, PY_SSIZE_T_MAX
9079 );
9080 break;
9081 default:
9082 assert(0); result = 0;
9083 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009084
9085 Py_DECREF(sub_obj);
9086 Py_DECREF(str_obj);
9087
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009088 if (kind1 != kind)
9089 PyMem_Free(buf1);
9090 if (kind2 != kind)
9091 PyMem_Free(buf2);
9092
Guido van Rossumd57fd912000-03-10 22:53:23 +00009093 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009094 onError:
9095 Py_DECREF(sub_obj);
9096 Py_DECREF(str_obj);
9097 if (kind1 != kind && buf1)
9098 PyMem_Free(buf1);
9099 if (kind2 != kind && buf2)
9100 PyMem_Free(buf2);
9101 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009102}
9103
Alexander Belopolsky40018472011-02-26 01:02:56 +00009104Py_ssize_t
9105PyUnicode_Find(PyObject *str,
9106 PyObject *sub,
9107 Py_ssize_t start,
9108 Py_ssize_t end,
9109 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009110{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009111 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009112
Guido van Rossumd57fd912000-03-10 22:53:23 +00009113 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009114 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009115 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009116 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009117 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009118 Py_DECREF(str);
9119 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009120 }
Tim Petersced69f82003-09-16 20:30:58 +00009121
Victor Stinner794d5672011-10-10 03:21:36 +02009122 result = any_find_slice(direction,
9123 str, sub, start, end
9124 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009125
Guido van Rossumd57fd912000-03-10 22:53:23 +00009126 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009127 Py_DECREF(sub);
9128
Guido van Rossumd57fd912000-03-10 22:53:23 +00009129 return result;
9130}
9131
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009132Py_ssize_t
9133PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9134 Py_ssize_t start, Py_ssize_t end,
9135 int direction)
9136{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009137 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009138 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009139 if (PyUnicode_READY(str) == -1)
9140 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009141 if (start < 0 || end < 0) {
9142 PyErr_SetString(PyExc_IndexError, "string index out of range");
9143 return -2;
9144 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009145 if (end > PyUnicode_GET_LENGTH(str))
9146 end = PyUnicode_GET_LENGTH(str);
9147 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009148 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9149 kind, end-start, ch, direction);
9150 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009151 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009152 else
9153 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009154}
9155
Alexander Belopolsky40018472011-02-26 01:02:56 +00009156static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009157tailmatch(PyObject *self,
9158 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009159 Py_ssize_t start,
9160 Py_ssize_t end,
9161 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009162{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009163 int kind_self;
9164 int kind_sub;
9165 void *data_self;
9166 void *data_sub;
9167 Py_ssize_t offset;
9168 Py_ssize_t i;
9169 Py_ssize_t end_sub;
9170
9171 if (PyUnicode_READY(self) == -1 ||
9172 PyUnicode_READY(substring) == -1)
9173 return 0;
9174
9175 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009176 return 1;
9177
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009178 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9179 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009180 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009181 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009182
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009183 kind_self = PyUnicode_KIND(self);
9184 data_self = PyUnicode_DATA(self);
9185 kind_sub = PyUnicode_KIND(substring);
9186 data_sub = PyUnicode_DATA(substring);
9187 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9188
9189 if (direction > 0)
9190 offset = end;
9191 else
9192 offset = start;
9193
9194 if (PyUnicode_READ(kind_self, data_self, offset) ==
9195 PyUnicode_READ(kind_sub, data_sub, 0) &&
9196 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9197 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9198 /* If both are of the same kind, memcmp is sufficient */
9199 if (kind_self == kind_sub) {
9200 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009201 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009202 data_sub,
9203 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009204 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009205 }
9206 /* otherwise we have to compare each character by first accesing it */
9207 else {
9208 /* We do not need to compare 0 and len(substring)-1 because
9209 the if statement above ensured already that they are equal
9210 when we end up here. */
9211 // TODO: honor direction and do a forward or backwards search
9212 for (i = 1; i < end_sub; ++i) {
9213 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9214 PyUnicode_READ(kind_sub, data_sub, i))
9215 return 0;
9216 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009217 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009218 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009219 }
9220
9221 return 0;
9222}
9223
Alexander Belopolsky40018472011-02-26 01:02:56 +00009224Py_ssize_t
9225PyUnicode_Tailmatch(PyObject *str,
9226 PyObject *substr,
9227 Py_ssize_t start,
9228 Py_ssize_t end,
9229 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009230{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009231 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009232
Guido van Rossumd57fd912000-03-10 22:53:23 +00009233 str = PyUnicode_FromObject(str);
9234 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009235 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009236 substr = PyUnicode_FromObject(substr);
9237 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009238 Py_DECREF(str);
9239 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009240 }
Tim Petersced69f82003-09-16 20:30:58 +00009241
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009242 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009243 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009244 Py_DECREF(str);
9245 Py_DECREF(substr);
9246 return result;
9247}
9248
Guido van Rossumd57fd912000-03-10 22:53:23 +00009249/* Apply fixfct filter to the Unicode object self and return a
9250 reference to the modified object */
9251
Alexander Belopolsky40018472011-02-26 01:02:56 +00009252static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009253fixup(PyObject *self,
9254 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009255{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009256 PyObject *u;
9257 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009258
Victor Stinner87af4f22011-11-21 23:03:47 +01009259 u = PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009260 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009261 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009262 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009263
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009264 /* fix functions return the new maximum character in a string,
9265 if the kind of the resulting unicode object does not change,
9266 everything is fine. Otherwise we need to change the string kind
9267 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009268 maxchar_new = fixfct(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009269 if (maxchar_new == 0)
9270 /* do nothing, keep maxchar_new at 0 which means no changes. */;
9271 else if (maxchar_new <= 127)
9272 maxchar_new = 127;
9273 else if (maxchar_new <= 255)
9274 maxchar_new = 255;
9275 else if (maxchar_new <= 65535)
9276 maxchar_new = 65535;
9277 else
9278 maxchar_new = 1114111; /* 0x10ffff */
9279
9280 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009281 /* fixfct should return TRUE if it modified the buffer. If
9282 FALSE, return a reference to the original buffer instead
9283 (to save space, not time) */
9284 Py_INCREF(self);
9285 Py_DECREF(u);
Victor Stinner7931d9a2011-11-04 00:22:48 +01009286 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009287 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009288 else if (maxchar_new == maxchar_old) {
9289 return u;
9290 }
9291 else {
9292 /* In case the maximum character changed, we need to
9293 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009294 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009295 if (v == NULL) {
9296 Py_DECREF(u);
9297 return NULL;
9298 }
9299 if (maxchar_new > maxchar_old) {
9300 /* If the maxchar increased so that the kind changed, not all
9301 characters are representable anymore and we need to fix the
9302 string again. This only happens in very few cases. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009303 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner9310abb2011-10-05 00:59:23 +02009304 maxchar_old = fixfct(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009305 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9306 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009307 else {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009308 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009309 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009310
9311 Py_DECREF(u);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009312 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009313 return v;
9314 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009315}
9316
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009317static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009318fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009319{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009320 /* No need to call PyUnicode_READY(self) because this function is only
9321 called as a callback from fixup() which does it already. */
9322 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9323 const int kind = PyUnicode_KIND(self);
9324 void *data = PyUnicode_DATA(self);
9325 int touched = 0;
9326 Py_UCS4 maxchar = 0;
9327 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009328
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009329 for (i = 0; i < len; ++i) {
9330 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9331 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
9332 if (up != ch) {
9333 if (up > maxchar)
9334 maxchar = up;
9335 PyUnicode_WRITE(kind, data, i, up);
9336 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009337 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009338 else if (ch > maxchar)
9339 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009340 }
9341
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009342 if (touched)
9343 return maxchar;
9344 else
9345 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009346}
9347
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009348static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009349fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009350{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009351 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9352 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9353 const int kind = PyUnicode_KIND(self);
9354 void *data = PyUnicode_DATA(self);
9355 int touched = 0;
9356 Py_UCS4 maxchar = 0;
9357 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009358
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009359 for(i = 0; i < len; ++i) {
9360 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9361 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9362 if (lo != ch) {
9363 if (lo > maxchar)
9364 maxchar = lo;
9365 PyUnicode_WRITE(kind, data, i, lo);
9366 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009367 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009368 else if (ch > maxchar)
9369 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009370 }
9371
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009372 if (touched)
9373 return maxchar;
9374 else
9375 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009376}
9377
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009378static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009379fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009380{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009381 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9382 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9383 const int kind = PyUnicode_KIND(self);
9384 void *data = PyUnicode_DATA(self);
9385 int touched = 0;
9386 Py_UCS4 maxchar = 0;
9387 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009388
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009389 for(i = 0; i < len; ++i) {
9390 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9391 Py_UCS4 nu = 0;
9392
9393 if (Py_UNICODE_ISUPPER(ch))
9394 nu = Py_UNICODE_TOLOWER(ch);
9395 else if (Py_UNICODE_ISLOWER(ch))
9396 nu = Py_UNICODE_TOUPPER(ch);
9397
9398 if (nu != 0) {
9399 if (nu > maxchar)
9400 maxchar = nu;
9401 PyUnicode_WRITE(kind, data, i, nu);
9402 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009403 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009404 else if (ch > maxchar)
9405 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009406 }
9407
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009408 if (touched)
9409 return maxchar;
9410 else
9411 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009412}
9413
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009414static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009415fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009416{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009417 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9418 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9419 const int kind = PyUnicode_KIND(self);
9420 void *data = PyUnicode_DATA(self);
9421 int touched = 0;
9422 Py_UCS4 maxchar = 0;
9423 Py_ssize_t i = 0;
9424 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00009425
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009426 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009427 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009428
9429 ch = PyUnicode_READ(kind, data, i);
9430 if (!Py_UNICODE_ISUPPER(ch)) {
9431 maxchar = Py_UNICODE_TOUPPER(ch);
9432 PyUnicode_WRITE(kind, data, i, maxchar);
9433 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009434 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009435 ++i;
9436 for(; i < len; ++i) {
9437 ch = PyUnicode_READ(kind, data, i);
9438 if (!Py_UNICODE_ISLOWER(ch)) {
9439 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9440 if (lo > maxchar)
9441 maxchar = lo;
9442 PyUnicode_WRITE(kind, data, i, lo);
9443 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009444 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009445 else if (ch > maxchar)
9446 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009447 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009448
9449 if (touched)
9450 return maxchar;
9451 else
9452 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009453}
9454
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009455static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009456fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009457{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009458 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9459 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9460 const int kind = PyUnicode_KIND(self);
9461 void *data = PyUnicode_DATA(self);
9462 Py_UCS4 maxchar = 0;
9463 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009464 int previous_is_cased;
9465
9466 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009467 if (len == 1) {
9468 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9469 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9470 if (ti != ch) {
9471 PyUnicode_WRITE(kind, data, i, ti);
9472 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00009473 }
9474 else
9475 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009476 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009477 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009478 for(; i < len; ++i) {
9479 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9480 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00009481
Benjamin Peterson29060642009-01-31 22:14:21 +00009482 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009483 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00009484 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009485 nu = Py_UNICODE_TOTITLE(ch);
9486
9487 if (nu > maxchar)
9488 maxchar = nu;
9489 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00009490
Benjamin Peterson29060642009-01-31 22:14:21 +00009491 if (Py_UNICODE_ISLOWER(ch) ||
9492 Py_UNICODE_ISUPPER(ch) ||
9493 Py_UNICODE_ISTITLE(ch))
9494 previous_is_cased = 1;
9495 else
9496 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009497 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009498 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009499}
9500
Tim Peters8ce9f162004-08-27 01:49:32 +00009501PyObject *
9502PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009503{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009504 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009505 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009506 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009507 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009508 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9509 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009510 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009511 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009512 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009513 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009514 int use_memcpy;
9515 unsigned char *res_data = NULL, *sep_data = NULL;
9516 PyObject *last_obj;
9517 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009518
Tim Peters05eba1f2004-08-27 21:32:02 +00009519 fseq = PySequence_Fast(seq, "");
9520 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009521 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009522 }
9523
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009524 /* NOTE: the following code can't call back into Python code,
9525 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009526 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009527
Tim Peters05eba1f2004-08-27 21:32:02 +00009528 seqlen = PySequence_Fast_GET_SIZE(fseq);
9529 /* If empty sequence, return u"". */
9530 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009531 Py_DECREF(fseq);
9532 Py_INCREF(unicode_empty);
9533 res = unicode_empty;
9534 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009535 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009536
Tim Peters05eba1f2004-08-27 21:32:02 +00009537 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009538 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009539 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009540 if (seqlen == 1) {
9541 if (PyUnicode_CheckExact(items[0])) {
9542 res = items[0];
9543 Py_INCREF(res);
9544 Py_DECREF(fseq);
9545 return res;
9546 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009547 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009548 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009549 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009550 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009551 /* Set up sep and seplen */
9552 if (separator == NULL) {
9553 /* fall back to a blank space separator */
9554 sep = PyUnicode_FromOrdinal(' ');
9555 if (!sep)
9556 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009557 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009558 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009559 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009560 else {
9561 if (!PyUnicode_Check(separator)) {
9562 PyErr_Format(PyExc_TypeError,
9563 "separator: expected str instance,"
9564 " %.80s found",
9565 Py_TYPE(separator)->tp_name);
9566 goto onError;
9567 }
9568 if (PyUnicode_READY(separator))
9569 goto onError;
9570 sep = separator;
9571 seplen = PyUnicode_GET_LENGTH(separator);
9572 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9573 /* inc refcount to keep this code path symmetric with the
9574 above case of a blank separator */
9575 Py_INCREF(sep);
9576 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009577 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009578 }
9579
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009580 /* There are at least two things to join, or else we have a subclass
9581 * of str in the sequence.
9582 * Do a pre-pass to figure out the total amount of space we'll
9583 * need (sz), and see whether all argument are strings.
9584 */
9585 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009586#ifdef Py_DEBUG
9587 use_memcpy = 0;
9588#else
9589 use_memcpy = 1;
9590#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009591 for (i = 0; i < seqlen; i++) {
9592 const Py_ssize_t old_sz = sz;
9593 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009594 if (!PyUnicode_Check(item)) {
9595 PyErr_Format(PyExc_TypeError,
9596 "sequence item %zd: expected str instance,"
9597 " %.80s found",
9598 i, Py_TYPE(item)->tp_name);
9599 goto onError;
9600 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009601 if (PyUnicode_READY(item) == -1)
9602 goto onError;
9603 sz += PyUnicode_GET_LENGTH(item);
9604 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009605 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009606 if (i != 0)
9607 sz += seplen;
9608 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9609 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009610 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009611 goto onError;
9612 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009613 if (use_memcpy && last_obj != NULL) {
9614 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9615 use_memcpy = 0;
9616 }
9617 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009618 }
Tim Petersced69f82003-09-16 20:30:58 +00009619
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009620 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009621 if (res == NULL)
9622 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009623
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009624 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009625#ifdef Py_DEBUG
9626 use_memcpy = 0;
9627#else
9628 if (use_memcpy) {
9629 res_data = PyUnicode_1BYTE_DATA(res);
9630 kind = PyUnicode_KIND(res);
9631 if (seplen != 0)
9632 sep_data = PyUnicode_1BYTE_DATA(sep);
9633 }
9634#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009635 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009636 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009637 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009638 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009639 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009640 if (use_memcpy) {
9641 Py_MEMCPY(res_data,
9642 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009643 kind * seplen);
9644 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009645 }
9646 else {
9647 copy_characters(res, res_offset, sep, 0, seplen);
9648 res_offset += seplen;
9649 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009650 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009651 itemlen = PyUnicode_GET_LENGTH(item);
9652 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009653 if (use_memcpy) {
9654 Py_MEMCPY(res_data,
9655 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009656 kind * itemlen);
9657 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009658 }
9659 else {
9660 copy_characters(res, res_offset, item, 0, itemlen);
9661 res_offset += itemlen;
9662 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009663 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009664 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009665 if (use_memcpy)
9666 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009667 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009668 else
9669 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009670
Tim Peters05eba1f2004-08-27 21:32:02 +00009671 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009672 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009673 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009674 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009675
Benjamin Peterson29060642009-01-31 22:14:21 +00009676 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009677 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009678 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009679 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009680 return NULL;
9681}
9682
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009683#define FILL(kind, data, value, start, length) \
9684 do { \
9685 Py_ssize_t i_ = 0; \
9686 assert(kind != PyUnicode_WCHAR_KIND); \
9687 switch ((kind)) { \
9688 case PyUnicode_1BYTE_KIND: { \
9689 unsigned char * to_ = (unsigned char *)((data)) + (start); \
9690 memset(to_, (unsigned char)value, length); \
9691 break; \
9692 } \
9693 case PyUnicode_2BYTE_KIND: { \
9694 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
9695 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9696 break; \
9697 } \
9698 default: { \
9699 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
9700 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9701 break; \
9702 } \
9703 } \
9704 } while (0)
9705
Victor Stinner9310abb2011-10-05 00:59:23 +02009706static PyObject *
9707pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009708 Py_ssize_t left,
9709 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009710 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009711{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009712 PyObject *u;
9713 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009714 int kind;
9715 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009716
9717 if (left < 0)
9718 left = 0;
9719 if (right < 0)
9720 right = 0;
9721
Tim Peters7a29bd52001-09-12 03:03:31 +00009722 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009723 Py_INCREF(self);
9724 return self;
9725 }
9726
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009727 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9728 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009729 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9730 return NULL;
9731 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009732 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9733 if (fill > maxchar)
9734 maxchar = fill;
9735 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009736 if (!u)
9737 return NULL;
9738
9739 kind = PyUnicode_KIND(u);
9740 data = PyUnicode_DATA(u);
9741 if (left)
9742 FILL(kind, data, fill, 0, left);
9743 if (right)
9744 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009745 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009746 assert(_PyUnicode_CheckConsistency(u, 1));
9747 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009748}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009749#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009750
Alexander Belopolsky40018472011-02-26 01:02:56 +00009751PyObject *
9752PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009753{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009754 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009755
9756 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009757 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009758 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009759
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009760 switch(PyUnicode_KIND(string)) {
9761 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009762 if (PyUnicode_IS_ASCII(string))
9763 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009764 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009765 PyUnicode_GET_LENGTH(string), keepends);
9766 else
9767 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009768 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009769 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009770 break;
9771 case PyUnicode_2BYTE_KIND:
9772 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009773 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009774 PyUnicode_GET_LENGTH(string), keepends);
9775 break;
9776 case PyUnicode_4BYTE_KIND:
9777 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009778 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009779 PyUnicode_GET_LENGTH(string), keepends);
9780 break;
9781 default:
9782 assert(0);
9783 list = 0;
9784 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009785 Py_DECREF(string);
9786 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009787}
9788
Alexander Belopolsky40018472011-02-26 01:02:56 +00009789static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009790split(PyObject *self,
9791 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009792 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009793{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009794 int kind1, kind2, kind;
9795 void *buf1, *buf2;
9796 Py_ssize_t len1, len2;
9797 PyObject* out;
9798
Guido van Rossumd57fd912000-03-10 22:53:23 +00009799 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009800 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009801
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009802 if (PyUnicode_READY(self) == -1)
9803 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009804
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009805 if (substring == NULL)
9806 switch(PyUnicode_KIND(self)) {
9807 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009808 if (PyUnicode_IS_ASCII(self))
9809 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009810 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009811 PyUnicode_GET_LENGTH(self), maxcount
9812 );
9813 else
9814 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009815 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009816 PyUnicode_GET_LENGTH(self), maxcount
9817 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009818 case PyUnicode_2BYTE_KIND:
9819 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009820 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009821 PyUnicode_GET_LENGTH(self), maxcount
9822 );
9823 case PyUnicode_4BYTE_KIND:
9824 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009825 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009826 PyUnicode_GET_LENGTH(self), maxcount
9827 );
9828 default:
9829 assert(0);
9830 return NULL;
9831 }
9832
9833 if (PyUnicode_READY(substring) == -1)
9834 return NULL;
9835
9836 kind1 = PyUnicode_KIND(self);
9837 kind2 = PyUnicode_KIND(substring);
9838 kind = kind1 > kind2 ? kind1 : kind2;
9839 buf1 = PyUnicode_DATA(self);
9840 buf2 = PyUnicode_DATA(substring);
9841 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009842 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009843 if (!buf1)
9844 return NULL;
9845 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009846 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009847 if (!buf2) {
9848 if (kind1 != kind) PyMem_Free(buf1);
9849 return NULL;
9850 }
9851 len1 = PyUnicode_GET_LENGTH(self);
9852 len2 = PyUnicode_GET_LENGTH(substring);
9853
9854 switch(kind) {
9855 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009856 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9857 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009858 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009859 else
9860 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009861 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009862 break;
9863 case PyUnicode_2BYTE_KIND:
9864 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009865 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009866 break;
9867 case PyUnicode_4BYTE_KIND:
9868 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009869 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009870 break;
9871 default:
9872 out = NULL;
9873 }
9874 if (kind1 != kind)
9875 PyMem_Free(buf1);
9876 if (kind2 != kind)
9877 PyMem_Free(buf2);
9878 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009879}
9880
Alexander Belopolsky40018472011-02-26 01:02:56 +00009881static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009882rsplit(PyObject *self,
9883 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009884 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009885{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009886 int kind1, kind2, kind;
9887 void *buf1, *buf2;
9888 Py_ssize_t len1, len2;
9889 PyObject* out;
9890
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009891 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009892 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009893
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009894 if (PyUnicode_READY(self) == -1)
9895 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009896
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009897 if (substring == NULL)
9898 switch(PyUnicode_KIND(self)) {
9899 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009900 if (PyUnicode_IS_ASCII(self))
9901 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009902 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009903 PyUnicode_GET_LENGTH(self), maxcount
9904 );
9905 else
9906 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009907 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009908 PyUnicode_GET_LENGTH(self), maxcount
9909 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009910 case PyUnicode_2BYTE_KIND:
9911 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009912 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009913 PyUnicode_GET_LENGTH(self), maxcount
9914 );
9915 case PyUnicode_4BYTE_KIND:
9916 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009917 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009918 PyUnicode_GET_LENGTH(self), maxcount
9919 );
9920 default:
9921 assert(0);
9922 return NULL;
9923 }
9924
9925 if (PyUnicode_READY(substring) == -1)
9926 return NULL;
9927
9928 kind1 = PyUnicode_KIND(self);
9929 kind2 = PyUnicode_KIND(substring);
9930 kind = kind1 > kind2 ? kind1 : kind2;
9931 buf1 = PyUnicode_DATA(self);
9932 buf2 = PyUnicode_DATA(substring);
9933 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009934 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009935 if (!buf1)
9936 return NULL;
9937 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009938 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009939 if (!buf2) {
9940 if (kind1 != kind) PyMem_Free(buf1);
9941 return NULL;
9942 }
9943 len1 = PyUnicode_GET_LENGTH(self);
9944 len2 = PyUnicode_GET_LENGTH(substring);
9945
9946 switch(kind) {
9947 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009948 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9949 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009950 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009951 else
9952 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009953 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009954 break;
9955 case PyUnicode_2BYTE_KIND:
9956 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009957 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009958 break;
9959 case PyUnicode_4BYTE_KIND:
9960 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009961 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009962 break;
9963 default:
9964 out = NULL;
9965 }
9966 if (kind1 != kind)
9967 PyMem_Free(buf1);
9968 if (kind2 != kind)
9969 PyMem_Free(buf2);
9970 return out;
9971}
9972
9973static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009974anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9975 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009976{
9977 switch(kind) {
9978 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009979 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9980 return asciilib_find(buf1, len1, buf2, len2, offset);
9981 else
9982 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009983 case PyUnicode_2BYTE_KIND:
9984 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9985 case PyUnicode_4BYTE_KIND:
9986 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9987 }
9988 assert(0);
9989 return -1;
9990}
9991
9992static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009993anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9994 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009995{
9996 switch(kind) {
9997 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009998 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9999 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10000 else
10001 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010002 case PyUnicode_2BYTE_KIND:
10003 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10004 case PyUnicode_4BYTE_KIND:
10005 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10006 }
10007 assert(0);
10008 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010009}
10010
Alexander Belopolsky40018472011-02-26 01:02:56 +000010011static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010012replace(PyObject *self, PyObject *str1,
10013 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010014{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010015 PyObject *u;
10016 char *sbuf = PyUnicode_DATA(self);
10017 char *buf1 = PyUnicode_DATA(str1);
10018 char *buf2 = PyUnicode_DATA(str2);
10019 int srelease = 0, release1 = 0, release2 = 0;
10020 int skind = PyUnicode_KIND(self);
10021 int kind1 = PyUnicode_KIND(str1);
10022 int kind2 = PyUnicode_KIND(str2);
10023 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10024 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10025 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010026 int mayshrink;
10027 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010028
10029 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010030 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010031 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010032 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010033
Victor Stinner59de0ee2011-10-07 10:01:28 +020010034 if (str1 == str2)
10035 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010036 if (skind < kind1)
10037 /* substring too wide to be present */
10038 goto nothing;
10039
Victor Stinner49a0a212011-10-12 23:46:10 +020010040 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10041 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10042 /* Replacing str1 with str2 may cause a maxchar reduction in the
10043 result string. */
10044 mayshrink = (maxchar_str2 < maxchar);
10045 maxchar = Py_MAX(maxchar, maxchar_str2);
10046
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010047 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +000010048 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010049 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010050 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010051 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010052 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010053 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010054 Py_UCS4 u1, u2;
10055 int rkind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010056 u1 = PyUnicode_READ_CHAR(str1, 0);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +020010057 if (findchar(sbuf, PyUnicode_KIND(self),
10058 slen, u1, 1) < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010059 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010060 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010061 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010062 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010063 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010064 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010065 rkind = PyUnicode_KIND(u);
10066 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
10067 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010068 if (--maxcount < 0)
10069 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010070 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010071 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010072 }
10073 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010074 int rkind = skind;
10075 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +020010076
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010077 if (kind1 < rkind) {
10078 /* widen substring */
10079 buf1 = _PyUnicode_AsKind(str1, rkind);
10080 if (!buf1) goto error;
10081 release1 = 1;
10082 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010083 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010084 if (i < 0)
10085 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010086 if (rkind > kind2) {
10087 /* widen replacement */
10088 buf2 = _PyUnicode_AsKind(str2, rkind);
10089 if (!buf2) goto error;
10090 release2 = 1;
10091 }
10092 else if (rkind < kind2) {
10093 /* widen self and buf1 */
10094 rkind = kind2;
10095 if (release1) PyMem_Free(buf1);
10096 sbuf = _PyUnicode_AsKind(self, rkind);
10097 if (!sbuf) goto error;
10098 srelease = 1;
10099 buf1 = _PyUnicode_AsKind(str1, rkind);
10100 if (!buf1) goto error;
10101 release1 = 1;
10102 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010103 u = PyUnicode_New(slen, maxchar);
10104 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010105 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010106 assert(PyUnicode_KIND(u) == rkind);
10107 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010108
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010109 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010110 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010111 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010112 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010113 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010114 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010115
10116 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010117 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010118 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010119 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010120 if (i == -1)
10121 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010122 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010123 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010124 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010125 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010126 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010127 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010128 }
10129 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010130 Py_ssize_t n, i, j, ires;
10131 Py_ssize_t product, new_size;
10132 int rkind = skind;
10133 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010134
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010135 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010136 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010137 buf1 = _PyUnicode_AsKind(str1, rkind);
10138 if (!buf1) goto error;
10139 release1 = 1;
10140 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010141 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010142 if (n == 0)
10143 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010144 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010145 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010146 buf2 = _PyUnicode_AsKind(str2, rkind);
10147 if (!buf2) goto error;
10148 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010149 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010150 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010151 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010152 rkind = kind2;
10153 sbuf = _PyUnicode_AsKind(self, rkind);
10154 if (!sbuf) goto error;
10155 srelease = 1;
10156 if (release1) PyMem_Free(buf1);
10157 buf1 = _PyUnicode_AsKind(str1, rkind);
10158 if (!buf1) goto error;
10159 release1 = 1;
10160 }
10161 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10162 PyUnicode_GET_LENGTH(str1))); */
10163 product = n * (len2-len1);
10164 if ((product / (len2-len1)) != n) {
10165 PyErr_SetString(PyExc_OverflowError,
10166 "replace string is too long");
10167 goto error;
10168 }
10169 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010170 if (new_size == 0) {
10171 Py_INCREF(unicode_empty);
10172 u = unicode_empty;
10173 goto done;
10174 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010175 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10176 PyErr_SetString(PyExc_OverflowError,
10177 "replace string is too long");
10178 goto error;
10179 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010180 u = PyUnicode_New(new_size, maxchar);
10181 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010182 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010183 assert(PyUnicode_KIND(u) == rkind);
10184 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010185 ires = i = 0;
10186 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010187 while (n-- > 0) {
10188 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010189 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010190 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010191 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010192 if (j == -1)
10193 break;
10194 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010195 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010196 memcpy(res + rkind * ires,
10197 sbuf + rkind * i,
10198 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010199 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010200 }
10201 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010202 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010203 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010204 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010205 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010206 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010207 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010208 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010209 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010210 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010211 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010212 memcpy(res + rkind * ires,
10213 sbuf + rkind * i,
10214 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010215 }
10216 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010217 /* interleave */
10218 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010219 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010220 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010221 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010222 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010223 if (--n <= 0)
10224 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010225 memcpy(res + rkind * ires,
10226 sbuf + rkind * i,
10227 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010228 ires++;
10229 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010230 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010231 memcpy(res + rkind * ires,
10232 sbuf + rkind * i,
10233 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010234 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010235 }
10236
10237 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010238 unicode_adjust_maxchar(&u);
10239 if (u == NULL)
10240 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010241 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010242
10243 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010244 if (srelease)
10245 PyMem_FREE(sbuf);
10246 if (release1)
10247 PyMem_FREE(buf1);
10248 if (release2)
10249 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010250 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010251 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010252
Benjamin Peterson29060642009-01-31 22:14:21 +000010253 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010254 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010255 if (srelease)
10256 PyMem_FREE(sbuf);
10257 if (release1)
10258 PyMem_FREE(buf1);
10259 if (release2)
10260 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010261 if (PyUnicode_CheckExact(self)) {
10262 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010010263 return self;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010264 }
Victor Stinner034f6cf2011-09-30 02:26:44 +020010265 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010266 error:
10267 if (srelease && sbuf)
10268 PyMem_FREE(sbuf);
10269 if (release1 && buf1)
10270 PyMem_FREE(buf1);
10271 if (release2 && buf2)
10272 PyMem_FREE(buf2);
10273 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010274}
10275
10276/* --- Unicode Object Methods --------------------------------------------- */
10277
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010278PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010279 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010280\n\
10281Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010282characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010283
10284static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010285unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010286{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010287 return fixup(self, fixtitle);
10288}
10289
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010290PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010291 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010292\n\
10293Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010294have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010295
10296static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010297unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010298{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010299 return fixup(self, fixcapitalize);
10300}
10301
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split self into words,
   capitalize each word in place inside the list, then join the list.
   Returns NULL if splitting or capitalizing fails. */
static PyObject*
unicode_capwords(PyObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list slot */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *word = PyList_GET_ITEM(words, idx);

        result = fixup(word, fixcapitalize);
        if (result == NULL)
            goto onError;
        Py_DECREF(word);
        PyList_SET_ITEM(words, idx, result);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  onError:
    /* reached on success as well; result is NULL only after a failure */
    Py_DECREF(words);
    return result;
}
#endif
10339
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010340/* Argument converter. Coerces to a single unicode character */
10341
10342static int
10343convert_uc(PyObject *obj, void *addr)
10344{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010345 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010346 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010347
Benjamin Peterson14339b62009-01-31 16:36:08 +000010348 uniobj = PyUnicode_FromObject(obj);
10349 if (uniobj == NULL) {
10350 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010351 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010352 return 0;
10353 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010354 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010355 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010356 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010357 Py_DECREF(uniobj);
10358 return 0;
10359 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010360 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010361 Py_DECREF(uniobj);
10362 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010363}
10364
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010365PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010366 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010367\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010368Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010369done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010370
10371static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010372unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010373{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010374 Py_ssize_t marg, left;
10375 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010376 Py_UCS4 fillchar = ' ';
10377
Victor Stinnere9a29352011-10-01 02:14:59 +020010378 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010379 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010380
Victor Stinnere9a29352011-10-01 02:14:59 +020010381 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010382 return NULL;
10383
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010384 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010385 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010010386 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010387 }
10388
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010389 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010390 left = marg / 2 + (marg & width & 1);
10391
Victor Stinner9310abb2011-10-05 00:59:23 +020010392 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010393}
10394
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010395/* This function assumes that str1 and str2 are readied by the caller. */
10396
Marc-André Lemburge5034372000-08-08 08:04:29 +000010397static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010398unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010399{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010400 int kind1, kind2;
10401 void *data1, *data2;
10402 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010403
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010404 kind1 = PyUnicode_KIND(str1);
10405 kind2 = PyUnicode_KIND(str2);
10406 data1 = PyUnicode_DATA(str1);
10407 data2 = PyUnicode_DATA(str2);
10408 len1 = PyUnicode_GET_LENGTH(str1);
10409 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010410
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010411 for (i = 0; i < len1 && i < len2; ++i) {
10412 Py_UCS4 c1, c2;
10413 c1 = PyUnicode_READ(kind1, data1, i);
10414 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010415
10416 if (c1 != c2)
10417 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010418 }
10419
10420 return (len1 < len2) ? -1 : (len1 != len2);
10421}
10422
Alexander Belopolsky40018472011-02-26 01:02:56 +000010423int
10424PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010425{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010426 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10427 if (PyUnicode_READY(left) == -1 ||
10428 PyUnicode_READY(right) == -1)
10429 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010430 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010431 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010432 PyErr_Format(PyExc_TypeError,
10433 "Can't compare %.100s and %.100s",
10434 left->ob_type->tp_name,
10435 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010436 return -1;
10437}
10438
Martin v. Löwis5b222132007-06-10 09:51:05 +000010439int
10440PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10441{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010442 Py_ssize_t i;
10443 int kind;
10444 void *data;
10445 Py_UCS4 chr;
10446
Victor Stinner910337b2011-10-03 03:20:16 +020010447 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010448 if (PyUnicode_READY(uni) == -1)
10449 return -1;
10450 kind = PyUnicode_KIND(uni);
10451 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010452 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010453 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10454 if (chr != str[i])
10455 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010456 /* This check keeps Python strings that end in '\0' from comparing equal
10457 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010458 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010459 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010460 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010461 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010462 return 0;
10463}
10464
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010465
Benjamin Peterson29060642009-01-31 22:14:21 +000010466#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010467 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010468
Alexander Belopolsky40018472011-02-26 01:02:56 +000010469PyObject *
10470PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010471{
10472 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010473
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010474 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10475 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010476 if (PyUnicode_READY(left) == -1 ||
10477 PyUnicode_READY(right) == -1)
10478 return NULL;
10479 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10480 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010481 if (op == Py_EQ) {
10482 Py_INCREF(Py_False);
10483 return Py_False;
10484 }
10485 if (op == Py_NE) {
10486 Py_INCREF(Py_True);
10487 return Py_True;
10488 }
10489 }
10490 if (left == right)
10491 result = 0;
10492 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010493 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010494
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010495 /* Convert the return value to a Boolean */
10496 switch (op) {
10497 case Py_EQ:
10498 v = TEST_COND(result == 0);
10499 break;
10500 case Py_NE:
10501 v = TEST_COND(result != 0);
10502 break;
10503 case Py_LE:
10504 v = TEST_COND(result <= 0);
10505 break;
10506 case Py_GE:
10507 v = TEST_COND(result >= 0);
10508 break;
10509 case Py_LT:
10510 v = TEST_COND(result == -1);
10511 break;
10512 case Py_GT:
10513 v = TEST_COND(result == 1);
10514 break;
10515 default:
10516 PyErr_BadArgument();
10517 return NULL;
10518 }
10519 Py_INCREF(v);
10520 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010521 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010522
Brian Curtindfc80e32011-08-10 20:28:54 -050010523 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010524}
10525
Alexander Belopolsky40018472011-02-26 01:02:56 +000010526int
10527PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010528{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010529 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010530 int kind1, kind2, kind;
10531 void *buf1, *buf2;
10532 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010533 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010534
10535 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010536 sub = PyUnicode_FromObject(element);
10537 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010538 PyErr_Format(PyExc_TypeError,
10539 "'in <string>' requires string as left operand, not %s",
10540 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010541 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010542 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010543 if (PyUnicode_READY(sub) == -1)
10544 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010545
Thomas Wouters477c8d52006-05-27 19:21:47 +000010546 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010547 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010548 Py_DECREF(sub);
10549 return -1;
10550 }
10551
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010552 kind1 = PyUnicode_KIND(str);
10553 kind2 = PyUnicode_KIND(sub);
10554 kind = kind1 > kind2 ? kind1 : kind2;
10555 buf1 = PyUnicode_DATA(str);
10556 buf2 = PyUnicode_DATA(sub);
10557 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010558 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010559 if (!buf1) {
10560 Py_DECREF(sub);
10561 return -1;
10562 }
10563 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010564 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010565 if (!buf2) {
10566 Py_DECREF(sub);
10567 if (kind1 != kind) PyMem_Free(buf1);
10568 return -1;
10569 }
10570 len1 = PyUnicode_GET_LENGTH(str);
10571 len2 = PyUnicode_GET_LENGTH(sub);
10572
10573 switch(kind) {
10574 case PyUnicode_1BYTE_KIND:
10575 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10576 break;
10577 case PyUnicode_2BYTE_KIND:
10578 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10579 break;
10580 case PyUnicode_4BYTE_KIND:
10581 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10582 break;
10583 default:
10584 result = -1;
10585 assert(0);
10586 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010587
10588 Py_DECREF(str);
10589 Py_DECREF(sub);
10590
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010591 if (kind1 != kind)
10592 PyMem_Free(buf1);
10593 if (kind2 != kind)
10594 PyMem_Free(buf2);
10595
Guido van Rossum403d68b2000-03-13 15:55:09 +000010596 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010597}
10598
Guido van Rossumd57fd912000-03-10 22:53:23 +000010599/* Concat to string or Unicode object giving a new Unicode object. */
10600
Alexander Belopolsky40018472011-02-26 01:02:56 +000010601PyObject *
10602PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010603{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010604 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010605 Py_UCS4 maxchar, maxchar2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010606
10607 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010608 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010609 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010610 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010611 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010612 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010613 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010614
10615 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010616 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010617 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010618 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010619 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010620 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010621 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010622 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010623 }
10624
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010625 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010626 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10627 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010628
Guido van Rossumd57fd912000-03-10 22:53:23 +000010629 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010630 w = PyUnicode_New(
10631 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10632 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010633 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010634 goto onError;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010635 copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
10636 copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010637 Py_DECREF(u);
10638 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010639 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010640 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010641
Benjamin Peterson29060642009-01-31 22:14:21 +000010642 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010643 Py_XDECREF(u);
10644 Py_XDECREF(v);
10645 return NULL;
10646}
10647
Victor Stinnerb0923652011-10-04 01:17:31 +020010648static void
10649unicode_append_inplace(PyObject **p_left, PyObject *right)
10650{
10651 Py_ssize_t left_len, right_len, new_len;
Victor Stinnerb0923652011-10-04 01:17:31 +020010652
10653 assert(PyUnicode_IS_READY(*p_left));
10654 assert(PyUnicode_IS_READY(right));
10655
10656 left_len = PyUnicode_GET_LENGTH(*p_left);
10657 right_len = PyUnicode_GET_LENGTH(right);
10658 if (left_len > PY_SSIZE_T_MAX - right_len) {
10659 PyErr_SetString(PyExc_OverflowError,
10660 "strings are too large to concat");
10661 goto error;
10662 }
10663 new_len = left_len + right_len;
10664
10665 /* Now we own the last reference to 'left', so we can resize it
10666 * in-place.
10667 */
10668 if (unicode_resize(p_left, new_len) != 0) {
10669 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10670 * deallocated so it cannot be put back into
10671 * 'variable'. The MemoryError is raised when there
10672 * is no value in 'variable', which might (very
10673 * remotely) be a cause of incompatibilities.
10674 */
10675 goto error;
10676 }
10677 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010678 copy_characters(*p_left, left_len, right, 0, right_len);
10679 _PyUnicode_DIRTY(*p_left);
Victor Stinnerb0923652011-10-04 01:17:31 +020010680 return;
10681
10682error:
10683 Py_DECREF(*p_left);
10684 *p_left = NULL;
10685}
10686
Walter Dörwald1ab83302007-05-18 17:15:44 +000010687void
Victor Stinner23e56682011-10-03 03:54:37 +020010688PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010689{
Victor Stinner23e56682011-10-03 03:54:37 +020010690 PyObject *left, *res;
10691
10692 if (p_left == NULL) {
10693 if (!PyErr_Occurred())
10694 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010695 return;
10696 }
Victor Stinner23e56682011-10-03 03:54:37 +020010697 left = *p_left;
10698 if (right == NULL || !PyUnicode_Check(left)) {
10699 if (!PyErr_Occurred())
10700 PyErr_BadInternalCall();
10701 goto error;
10702 }
10703
Victor Stinnere1335c72011-10-04 20:53:03 +020010704 if (PyUnicode_READY(left))
10705 goto error;
10706 if (PyUnicode_READY(right))
10707 goto error;
10708
Victor Stinner23e56682011-10-03 03:54:37 +020010709 if (PyUnicode_CheckExact(left) && left != unicode_empty
10710 && PyUnicode_CheckExact(right) && right != unicode_empty
10711 && unicode_resizable(left)
10712 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10713 || _PyUnicode_WSTR(left) != NULL))
10714 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010715 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10716 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010717 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010718 not so different than duplicating the string. */
10719 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010720 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010721 unicode_append_inplace(p_left, right);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010722 assert(p_left == NULL || _PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010723 return;
10724 }
10725 }
10726
10727 res = PyUnicode_Concat(left, right);
10728 if (res == NULL)
10729 goto error;
10730 Py_DECREF(left);
10731 *p_left = res;
10732 return;
10733
10734error:
10735 Py_DECREF(*p_left);
10736 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010737}
10738
10739void
10740PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10741{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010742 PyUnicode_Append(pleft, right);
10743 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010744}
10745
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010746PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010747 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010748\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010749Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010750string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010751interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010752
10753static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010754unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010755{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010756 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010757 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010758 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010759 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010760 int kind1, kind2, kind;
10761 void *buf1, *buf2;
10762 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010763
Jesus Ceaac451502011-04-20 17:09:23 +020010764 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10765 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010766 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010767
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010768 kind1 = PyUnicode_KIND(self);
10769 kind2 = PyUnicode_KIND(substring);
10770 kind = kind1 > kind2 ? kind1 : kind2;
10771 buf1 = PyUnicode_DATA(self);
10772 buf2 = PyUnicode_DATA(substring);
10773 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010774 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010775 if (!buf1) {
10776 Py_DECREF(substring);
10777 return NULL;
10778 }
10779 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010780 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010781 if (!buf2) {
10782 Py_DECREF(substring);
10783 if (kind1 != kind) PyMem_Free(buf1);
10784 return NULL;
10785 }
10786 len1 = PyUnicode_GET_LENGTH(self);
10787 len2 = PyUnicode_GET_LENGTH(substring);
10788
10789 ADJUST_INDICES(start, end, len1);
10790 switch(kind) {
10791 case PyUnicode_1BYTE_KIND:
10792 iresult = ucs1lib_count(
10793 ((Py_UCS1*)buf1) + start, end - start,
10794 buf2, len2, PY_SSIZE_T_MAX
10795 );
10796 break;
10797 case PyUnicode_2BYTE_KIND:
10798 iresult = ucs2lib_count(
10799 ((Py_UCS2*)buf1) + start, end - start,
10800 buf2, len2, PY_SSIZE_T_MAX
10801 );
10802 break;
10803 case PyUnicode_4BYTE_KIND:
10804 iresult = ucs4lib_count(
10805 ((Py_UCS4*)buf1) + start, end - start,
10806 buf2, len2, PY_SSIZE_T_MAX
10807 );
10808 break;
10809 default:
10810 assert(0); iresult = 0;
10811 }
10812
10813 result = PyLong_FromSsize_t(iresult);
10814
10815 if (kind1 != kind)
10816 PyMem_Free(buf1);
10817 if (kind2 != kind)
10818 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010819
10820 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010821
Guido van Rossumd57fd912000-03-10 22:53:23 +000010822 return result;
10823}
10824
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010825PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010826 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010827\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010828Encode S using the codec registered for encoding. Default encoding\n\
10829is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010830handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010831a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10832'xmlcharrefreplace' as well as any other name registered with\n\
10833codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010834
10835static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010836unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010837{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010838 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010839 char *encoding = NULL;
10840 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010841
Benjamin Peterson308d6372009-09-18 21:42:35 +000010842 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10843 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010844 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010845 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010846}
10847
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010848PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010849 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010850\n\
10851Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010852If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010853
10854static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010855unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010856{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010857 Py_ssize_t i, j, line_pos, src_len, incr;
10858 Py_UCS4 ch;
10859 PyObject *u;
10860 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010861 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010862 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010863 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010864
10865 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010866 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010867
Antoine Pitrou22425222011-10-04 19:10:51 +020010868 if (PyUnicode_READY(self) == -1)
10869 return NULL;
10870
Thomas Wouters7e474022000-07-16 12:04:32 +000010871 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010872 src_len = PyUnicode_GET_LENGTH(self);
10873 i = j = line_pos = 0;
10874 kind = PyUnicode_KIND(self);
10875 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010876 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010877 for (; i < src_len; i++) {
10878 ch = PyUnicode_READ(kind, src_data, i);
10879 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010880 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010881 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010882 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010883 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010884 goto overflow;
10885 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010886 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010887 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010888 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010889 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010890 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010891 goto overflow;
10892 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010893 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010894 if (ch == '\n' || ch == '\r')
10895 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010896 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010897 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020010898 if (!found && PyUnicode_CheckExact(self)) {
Victor Stinner7931d9a2011-11-04 00:22:48 +010010899 Py_INCREF(self);
10900 return self;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010901 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010902
Guido van Rossumd57fd912000-03-10 22:53:23 +000010903 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010904 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010905 if (!u)
10906 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010907 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010908
Antoine Pitroue71d5742011-10-04 15:55:09 +020010909 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010910
Antoine Pitroue71d5742011-10-04 15:55:09 +020010911 for (; i < src_len; i++) {
10912 ch = PyUnicode_READ(kind, src_data, i);
10913 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010914 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010915 incr = tabsize - (line_pos % tabsize);
10916 line_pos += incr;
10917 while (incr--) {
10918 PyUnicode_WRITE(kind, dest_data, j, ' ');
10919 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010920 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010921 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010922 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010923 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010924 line_pos++;
10925 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010926 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010927 if (ch == '\n' || ch == '\r')
10928 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010929 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010930 }
10931 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010932 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010933
Antoine Pitroue71d5742011-10-04 15:55:09 +020010934 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010935 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10936 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010937}
10938
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010939PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010940 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010941\n\
10942Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010943such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010944arguments start and end are interpreted as in slice notation.\n\
10945\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010946Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010947
10948static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010949unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010950{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010951 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010952 Py_ssize_t start;
10953 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010954 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010955
Jesus Ceaac451502011-04-20 17:09:23 +020010956 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10957 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010958 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010959
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010960 if (PyUnicode_READY(self) == -1)
10961 return NULL;
10962 if (PyUnicode_READY(substring) == -1)
10963 return NULL;
10964
Victor Stinner7931d9a2011-11-04 00:22:48 +010010965 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010966
10967 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010968
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010969 if (result == -2)
10970 return NULL;
10971
Christian Heimes217cfd12007-12-02 14:31:20 +000010972 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010973}
10974
10975static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010976unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010977{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010978 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10979 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010980 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010981 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010982}
10983
Guido van Rossumc2504932007-09-18 19:42:40 +000010984/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010985 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010986static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010987unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010988{
Guido van Rossumc2504932007-09-18 19:42:40 +000010989 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010990 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010991
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010992 if (_PyUnicode_HASH(self) != -1)
10993 return _PyUnicode_HASH(self);
10994 if (PyUnicode_READY(self) == -1)
10995 return -1;
10996 len = PyUnicode_GET_LENGTH(self);
10997
10998 /* The hash function as a macro, gets expanded three times below. */
10999#define HASH(P) \
11000 x = (Py_uhash_t)*P << 7; \
11001 while (--len >= 0) \
11002 x = (1000003*x) ^ (Py_uhash_t)*P++;
11003
11004 switch (PyUnicode_KIND(self)) {
11005 case PyUnicode_1BYTE_KIND: {
11006 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11007 HASH(c);
11008 break;
11009 }
11010 case PyUnicode_2BYTE_KIND: {
11011 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11012 HASH(s);
11013 break;
11014 }
11015 default: {
11016 Py_UCS4 *l;
11017 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11018 "Impossible switch case in unicode_hash");
11019 l = PyUnicode_4BYTE_DATA(self);
11020 HASH(l);
11021 break;
11022 }
11023 }
11024 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
11025
Guido van Rossumc2504932007-09-18 19:42:40 +000011026 if (x == -1)
11027 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011028 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011029 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011030}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011031#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011032
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011033PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011034 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011035\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011036Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011037
11038static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011039unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011040{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011041 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011042 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011043 Py_ssize_t start;
11044 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011045
Jesus Ceaac451502011-04-20 17:09:23 +020011046 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11047 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011048 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011049
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011050 if (PyUnicode_READY(self) == -1)
11051 return NULL;
11052 if (PyUnicode_READY(substring) == -1)
11053 return NULL;
11054
Victor Stinner7931d9a2011-11-04 00:22:48 +010011055 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011056
11057 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011058
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011059 if (result == -2)
11060 return NULL;
11061
Guido van Rossumd57fd912000-03-10 22:53:23 +000011062 if (result < 0) {
11063 PyErr_SetString(PyExc_ValueError, "substring not found");
11064 return NULL;
11065 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011066
Christian Heimes217cfd12007-12-02 14:31:20 +000011067 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011068}
11069
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011070PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011071 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011072\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011073Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011074at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011075
11076static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011077unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011078{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011079 Py_ssize_t i, length;
11080 int kind;
11081 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011082 int cased;
11083
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011084 if (PyUnicode_READY(self) == -1)
11085 return NULL;
11086 length = PyUnicode_GET_LENGTH(self);
11087 kind = PyUnicode_KIND(self);
11088 data = PyUnicode_DATA(self);
11089
Guido van Rossumd57fd912000-03-10 22:53:23 +000011090 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011091 if (length == 1)
11092 return PyBool_FromLong(
11093 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011094
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011095 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011096 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011097 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011098
Guido van Rossumd57fd912000-03-10 22:53:23 +000011099 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011100 for (i = 0; i < length; i++) {
11101 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011102
Benjamin Peterson29060642009-01-31 22:14:21 +000011103 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11104 return PyBool_FromLong(0);
11105 else if (!cased && Py_UNICODE_ISLOWER(ch))
11106 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011107 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011108 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011109}
11110
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011111PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011112 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011113\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011114Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011115at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011116
11117static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011118unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011119{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011120 Py_ssize_t i, length;
11121 int kind;
11122 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011123 int cased;
11124
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011125 if (PyUnicode_READY(self) == -1)
11126 return NULL;
11127 length = PyUnicode_GET_LENGTH(self);
11128 kind = PyUnicode_KIND(self);
11129 data = PyUnicode_DATA(self);
11130
Guido van Rossumd57fd912000-03-10 22:53:23 +000011131 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011132 if (length == 1)
11133 return PyBool_FromLong(
11134 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011135
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011136 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011137 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011138 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011139
Guido van Rossumd57fd912000-03-10 22:53:23 +000011140 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011141 for (i = 0; i < length; i++) {
11142 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011143
Benjamin Peterson29060642009-01-31 22:14:21 +000011144 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11145 return PyBool_FromLong(0);
11146 else if (!cased && Py_UNICODE_ISUPPER(ch))
11147 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011148 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011149 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011150}
11151
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011152PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011153 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011154\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011155Return True if S is a titlecased string and there is at least one\n\
11156character in S, i.e. upper- and titlecase characters may only\n\
11157follow uncased characters and lowercase characters only cased ones.\n\
11158Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011159
11160static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011161unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011162{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011163 Py_ssize_t i, length;
11164 int kind;
11165 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011166 int cased, previous_is_cased;
11167
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011168 if (PyUnicode_READY(self) == -1)
11169 return NULL;
11170 length = PyUnicode_GET_LENGTH(self);
11171 kind = PyUnicode_KIND(self);
11172 data = PyUnicode_DATA(self);
11173
Guido van Rossumd57fd912000-03-10 22:53:23 +000011174 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011175 if (length == 1) {
11176 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11177 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11178 (Py_UNICODE_ISUPPER(ch) != 0));
11179 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011180
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011181 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011182 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011183 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011184
Guido van Rossumd57fd912000-03-10 22:53:23 +000011185 cased = 0;
11186 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011187 for (i = 0; i < length; i++) {
11188 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011189
Benjamin Peterson29060642009-01-31 22:14:21 +000011190 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11191 if (previous_is_cased)
11192 return PyBool_FromLong(0);
11193 previous_is_cased = 1;
11194 cased = 1;
11195 }
11196 else if (Py_UNICODE_ISLOWER(ch)) {
11197 if (!previous_is_cased)
11198 return PyBool_FromLong(0);
11199 previous_is_cased = 1;
11200 cased = 1;
11201 }
11202 else
11203 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011204 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011205 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011206}
11207
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011208PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011209 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011210\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011211Return True if all characters in S are whitespace\n\
11212and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011213
11214static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011215unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011216{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011217 Py_ssize_t i, length;
11218 int kind;
11219 void *data;
11220
11221 if (PyUnicode_READY(self) == -1)
11222 return NULL;
11223 length = PyUnicode_GET_LENGTH(self);
11224 kind = PyUnicode_KIND(self);
11225 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011226
Guido van Rossumd57fd912000-03-10 22:53:23 +000011227 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011228 if (length == 1)
11229 return PyBool_FromLong(
11230 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011231
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011232 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011233 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011234 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011235
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011236 for (i = 0; i < length; i++) {
11237 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011238 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011239 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011240 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011241 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011242}
11243
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011244PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011245 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011246\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011247Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011248and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011249
11250static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011251unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011252{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011253 Py_ssize_t i, length;
11254 int kind;
11255 void *data;
11256
11257 if (PyUnicode_READY(self) == -1)
11258 return NULL;
11259 length = PyUnicode_GET_LENGTH(self);
11260 kind = PyUnicode_KIND(self);
11261 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011262
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011263 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011264 if (length == 1)
11265 return PyBool_FromLong(
11266 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011267
11268 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011269 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011270 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011271
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011272 for (i = 0; i < length; i++) {
11273 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011274 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011275 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011276 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011277}
11278
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011279PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011280 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011281\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011282Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011283and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011284
11285static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011286unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011287{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011288 int kind;
11289 void *data;
11290 Py_ssize_t len, i;
11291
11292 if (PyUnicode_READY(self) == -1)
11293 return NULL;
11294
11295 kind = PyUnicode_KIND(self);
11296 data = PyUnicode_DATA(self);
11297 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011298
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011299 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011300 if (len == 1) {
11301 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11302 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11303 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011304
11305 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011306 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011307 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011308
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011309 for (i = 0; i < len; i++) {
11310 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011311 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011312 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011313 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011314 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011315}
11316
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011317PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011318 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011319\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011320Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011321False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011322
11323static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011324unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011325{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011326 Py_ssize_t i, length;
11327 int kind;
11328 void *data;
11329
11330 if (PyUnicode_READY(self) == -1)
11331 return NULL;
11332 length = PyUnicode_GET_LENGTH(self);
11333 kind = PyUnicode_KIND(self);
11334 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011335
Guido van Rossumd57fd912000-03-10 22:53:23 +000011336 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011337 if (length == 1)
11338 return PyBool_FromLong(
11339 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011340
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011341 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011342 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011343 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011344
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011345 for (i = 0; i < length; i++) {
11346 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011347 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011348 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011349 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011350}
11351
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011352PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011353 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011354\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011355Return True if all characters in S are digits\n\
11356and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011357
11358static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011359unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011360{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011361 Py_ssize_t i, length;
11362 int kind;
11363 void *data;
11364
11365 if (PyUnicode_READY(self) == -1)
11366 return NULL;
11367 length = PyUnicode_GET_LENGTH(self);
11368 kind = PyUnicode_KIND(self);
11369 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011370
Guido van Rossumd57fd912000-03-10 22:53:23 +000011371 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011372 if (length == 1) {
11373 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11374 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11375 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011376
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011377 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011378 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011379 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011380
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011381 for (i = 0; i < length; i++) {
11382 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011383 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011384 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011385 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011386}
11387
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011388PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011389 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011390\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011391Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011392False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011393
11394static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011395unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011396{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011397 Py_ssize_t i, length;
11398 int kind;
11399 void *data;
11400
11401 if (PyUnicode_READY(self) == -1)
11402 return NULL;
11403 length = PyUnicode_GET_LENGTH(self);
11404 kind = PyUnicode_KIND(self);
11405 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011406
Guido van Rossumd57fd912000-03-10 22:53:23 +000011407 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011408 if (length == 1)
11409 return PyBool_FromLong(
11410 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011411
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011412 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011413 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011414 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011415
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011416 for (i = 0; i < length; i++) {
11417 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011418 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011419 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011420 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011421}
11422
Martin v. Löwis47383402007-08-15 07:32:56 +000011423int
11424PyUnicode_IsIdentifier(PyObject *self)
11425{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011426 int kind;
11427 void *data;
11428 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011429 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011430
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011431 if (PyUnicode_READY(self) == -1) {
11432 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011433 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011434 }
11435
11436 /* Special case for empty strings */
11437 if (PyUnicode_GET_LENGTH(self) == 0)
11438 return 0;
11439 kind = PyUnicode_KIND(self);
11440 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011441
11442 /* PEP 3131 says that the first character must be in
11443 XID_Start and subsequent characters in XID_Continue,
11444 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011445 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011446 letters, digits, underscore). However, given the current
11447 definition of XID_Start and XID_Continue, it is sufficient
11448 to check just for these, except that _ must be allowed
11449 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011450 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011451 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011452 return 0;
11453
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011454 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011455 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011456 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011457 return 1;
11458}
11459
11460PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011461 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011462\n\
11463Return True if S is a valid identifier according\n\
11464to the language definition.");
11465
11466static PyObject*
11467unicode_isidentifier(PyObject *self)
11468{
11469 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11470}
11471
Georg Brandl559e5d72008-06-11 18:37:52 +000011472PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011473 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011474\n\
11475Return True if all characters in S are considered\n\
11476printable in repr() or S is empty, False otherwise.");
11477
11478static PyObject*
11479unicode_isprintable(PyObject *self)
11480{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011481 Py_ssize_t i, length;
11482 int kind;
11483 void *data;
11484
11485 if (PyUnicode_READY(self) == -1)
11486 return NULL;
11487 length = PyUnicode_GET_LENGTH(self);
11488 kind = PyUnicode_KIND(self);
11489 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011490
11491 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011492 if (length == 1)
11493 return PyBool_FromLong(
11494 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011495
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011496 for (i = 0; i < length; i++) {
11497 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011498 Py_RETURN_FALSE;
11499 }
11500 }
11501 Py_RETURN_TRUE;
11502}
11503
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011504PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011505 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011506\n\
11507Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011508iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011509
11510static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011511unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011512{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011513 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011514}
11515
Martin v. Löwis18e16552006-02-15 17:27:45 +000011516static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011517unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011518{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011519 if (PyUnicode_READY(self) == -1)
11520 return -1;
11521 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011522}
11523
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011524PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011525 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011526\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011527Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011528done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011529
11530static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011531unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011532{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011533 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011534 Py_UCS4 fillchar = ' ';
11535
11536 if (PyUnicode_READY(self) == -1)
11537 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011538
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011539 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011540 return NULL;
11541
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011542 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011543 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010011544 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011545 }
11546
Victor Stinner7931d9a2011-11-04 00:22:48 +010011547 return pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011548}
11549
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011550PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011551 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011552\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011553Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011554
11555static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011556unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011557{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011558 return fixup(self, fixlower);
11559}
11560
/* Selectors for which end(s) of the string a strip operation trims. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
/* PyArg_ParseTuple format strings, one per selector: a single optional
   object argument, with the method name after the ':'. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip type i: the format string with its leading "|O:"
   (3 characters) skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
11569
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011570/* externally visible for str.strip(unicode) */
11571PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011572_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011573{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011574 void *data;
11575 int kind;
11576 Py_ssize_t i, j, len;
11577 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011578
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011579 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11580 return NULL;
11581
11582 kind = PyUnicode_KIND(self);
11583 data = PyUnicode_DATA(self);
11584 len = PyUnicode_GET_LENGTH(self);
11585 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11586 PyUnicode_DATA(sepobj),
11587 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011588
Benjamin Peterson14339b62009-01-31 16:36:08 +000011589 i = 0;
11590 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011591 while (i < len &&
11592 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011593 i++;
11594 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011595 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011596
Benjamin Peterson14339b62009-01-31 16:36:08 +000011597 j = len;
11598 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011599 do {
11600 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011601 } while (j >= i &&
11602 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011603 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011604 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011605
Victor Stinner7931d9a2011-11-04 00:22:48 +010011606 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011607}
11608
11609PyObject*
11610PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11611{
11612 unsigned char *data;
11613 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011614 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011615
Victor Stinnerde636f32011-10-01 03:55:54 +020011616 if (PyUnicode_READY(self) == -1)
11617 return NULL;
11618
11619 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11620
Victor Stinner12bab6d2011-10-01 01:53:49 +020011621 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011622 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011623 if (PyUnicode_CheckExact(self)) {
11624 Py_INCREF(self);
11625 return self;
11626 }
11627 else
11628 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011629 }
11630
Victor Stinner12bab6d2011-10-01 01:53:49 +020011631 length = end - start;
11632 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011633 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011634
Victor Stinnerde636f32011-10-01 03:55:54 +020011635 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011636 PyErr_SetString(PyExc_IndexError, "string index out of range");
11637 return NULL;
11638 }
11639
Victor Stinnerb9275c12011-10-05 14:01:42 +020011640 if (PyUnicode_IS_ASCII(self)) {
11641 kind = PyUnicode_KIND(self);
11642 data = PyUnicode_1BYTE_DATA(self);
11643 return unicode_fromascii(data + start, length);
11644 }
11645 else {
11646 kind = PyUnicode_KIND(self);
11647 data = PyUnicode_1BYTE_DATA(self);
11648 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011649 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011650 length);
11651 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011652}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011653
11654static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011655do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011656{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011657 int kind;
11658 void *data;
11659 Py_ssize_t len, i, j;
11660
11661 if (PyUnicode_READY(self) == -1)
11662 return NULL;
11663
11664 kind = PyUnicode_KIND(self);
11665 data = PyUnicode_DATA(self);
11666 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011667
Benjamin Peterson14339b62009-01-31 16:36:08 +000011668 i = 0;
11669 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011670 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011671 i++;
11672 }
11673 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011674
Benjamin Peterson14339b62009-01-31 16:36:08 +000011675 j = len;
11676 if (striptype != LEFTSTRIP) {
11677 do {
11678 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011679 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011680 j++;
11681 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011682
Victor Stinner7931d9a2011-11-04 00:22:48 +010011683 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011684}
11685
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011686
11687static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011688do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011689{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011690 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011691
Benjamin Peterson14339b62009-01-31 16:36:08 +000011692 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11693 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011694
Benjamin Peterson14339b62009-01-31 16:36:08 +000011695 if (sep != NULL && sep != Py_None) {
11696 if (PyUnicode_Check(sep))
11697 return _PyUnicode_XStrip(self, striptype, sep);
11698 else {
11699 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011700 "%s arg must be None or str",
11701 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011702 return NULL;
11703 }
11704 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011705
Benjamin Peterson14339b62009-01-31 16:36:08 +000011706 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011707}
11708
11709
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011710PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011711 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011712\n\
11713Return a copy of the string S with leading and trailing\n\
11714whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011715If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011716
11717static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011718unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011719{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011720 if (PyTuple_GET_SIZE(args) == 0)
11721 return do_strip(self, BOTHSTRIP); /* Common case */
11722 else
11723 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011724}
11725
11726
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011727PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011728 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011729\n\
11730Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011731If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011732
11733static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011734unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011735{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011736 if (PyTuple_GET_SIZE(args) == 0)
11737 return do_strip(self, LEFTSTRIP); /* Common case */
11738 else
11739 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011740}
11741
11742
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011743PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011744 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011745\n\
11746Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011747If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011748
11749static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011750unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011751{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011752 if (PyTuple_GET_SIZE(args) == 0)
11753 return do_strip(self, RIGHTSTRIP); /* Common case */
11754 else
11755 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011756}
11757
11758
Guido van Rossumd57fd912000-03-10 22:53:23 +000011759static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011760unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011761{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011762 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011763 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011764
Georg Brandl222de0f2009-04-12 12:01:50 +000011765 if (len < 1) {
11766 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011767 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011768 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011769
Tim Peters7a29bd52001-09-12 03:03:31 +000011770 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011771 /* no repeat, return original string */
11772 Py_INCREF(str);
Victor Stinner7931d9a2011-11-04 00:22:48 +010011773 return str;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011774 }
Tim Peters8f422462000-09-09 06:13:41 +000011775
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011776 if (PyUnicode_READY(str) == -1)
11777 return NULL;
11778
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011779 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011780 PyErr_SetString(PyExc_OverflowError,
11781 "repeated string is too long");
11782 return NULL;
11783 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011784 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011785
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011786 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011787 if (!u)
11788 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011789 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011790
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011791 if (PyUnicode_GET_LENGTH(str) == 1) {
11792 const int kind = PyUnicode_KIND(str);
11793 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11794 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011795 if (kind == PyUnicode_1BYTE_KIND)
11796 memset(to, (unsigned char)fill_char, len);
11797 else {
11798 for (n = 0; n < len; ++n)
11799 PyUnicode_WRITE(kind, to, n, fill_char);
11800 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011801 }
11802 else {
11803 /* number of characters copied this far */
11804 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011805 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011806 char *to = (char *) PyUnicode_DATA(u);
11807 Py_MEMCPY(to, PyUnicode_DATA(str),
11808 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011809 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011810 n = (done <= nchars-done) ? done : nchars-done;
11811 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011812 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011813 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011814 }
11815
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011816 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011817 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011818}
11819
Alexander Belopolsky40018472011-02-26 01:02:56 +000011820PyObject *
11821PyUnicode_Replace(PyObject *obj,
11822 PyObject *subobj,
11823 PyObject *replobj,
11824 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011825{
11826 PyObject *self;
11827 PyObject *str1;
11828 PyObject *str2;
11829 PyObject *result;
11830
11831 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011832 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011833 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011834 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011835 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011836 Py_DECREF(self);
11837 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011838 }
11839 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011840 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011841 Py_DECREF(self);
11842 Py_DECREF(str1);
11843 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011844 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011845 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011846 Py_DECREF(self);
11847 Py_DECREF(str1);
11848 Py_DECREF(str2);
11849 return result;
11850}
11851
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011852PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011853 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011854\n\
11855Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011856old replaced by new. If the optional argument count is\n\
11857given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011858
11859static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011860unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011861{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011862 PyObject *str1;
11863 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011864 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011865 PyObject *result;
11866
Martin v. Löwis18e16552006-02-15 17:27:45 +000011867 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011868 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011869 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011870 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011871 str1 = PyUnicode_FromObject(str1);
11872 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11873 return NULL;
11874 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011875 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011876 Py_DECREF(str1);
11877 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011878 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011879
11880 result = replace(self, str1, str2, maxcount);
11881
11882 Py_DECREF(str1);
11883 Py_DECREF(str2);
11884 return result;
11885}
11886
Alexander Belopolsky40018472011-02-26 01:02:56 +000011887static PyObject *
11888unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011889{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011890 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011891 Py_ssize_t isize;
11892 Py_ssize_t osize, squote, dquote, i, o;
11893 Py_UCS4 max, quote;
11894 int ikind, okind;
11895 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011896
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011897 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011898 return NULL;
11899
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011900 isize = PyUnicode_GET_LENGTH(unicode);
11901 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011902
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011903 /* Compute length of output, quote characters, and
11904 maximum character */
11905 osize = 2; /* quotes */
11906 max = 127;
11907 squote = dquote = 0;
11908 ikind = PyUnicode_KIND(unicode);
11909 for (i = 0; i < isize; i++) {
11910 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11911 switch (ch) {
11912 case '\'': squote++; osize++; break;
11913 case '"': dquote++; osize++; break;
11914 case '\\': case '\t': case '\r': case '\n':
11915 osize += 2; break;
11916 default:
11917 /* Fast-path ASCII */
11918 if (ch < ' ' || ch == 0x7f)
11919 osize += 4; /* \xHH */
11920 else if (ch < 0x7f)
11921 osize++;
11922 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11923 osize++;
11924 max = ch > max ? ch : max;
11925 }
11926 else if (ch < 0x100)
11927 osize += 4; /* \xHH */
11928 else if (ch < 0x10000)
11929 osize += 6; /* \uHHHH */
11930 else
11931 osize += 10; /* \uHHHHHHHH */
11932 }
11933 }
11934
11935 quote = '\'';
11936 if (squote) {
11937 if (dquote)
11938 /* Both squote and dquote present. Use squote,
11939 and escape them */
11940 osize += squote;
11941 else
11942 quote = '"';
11943 }
11944
11945 repr = PyUnicode_New(osize, max);
11946 if (repr == NULL)
11947 return NULL;
11948 okind = PyUnicode_KIND(repr);
11949 odata = PyUnicode_DATA(repr);
11950
11951 PyUnicode_WRITE(okind, odata, 0, quote);
11952 PyUnicode_WRITE(okind, odata, osize-1, quote);
11953
11954 for (i = 0, o = 1; i < isize; i++) {
11955 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011956
11957 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011958 if ((ch == quote) || (ch == '\\')) {
11959 PyUnicode_WRITE(okind, odata, o++, '\\');
11960 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011961 continue;
11962 }
11963
Benjamin Peterson29060642009-01-31 22:14:21 +000011964 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011965 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011966 PyUnicode_WRITE(okind, odata, o++, '\\');
11967 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011968 }
11969 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011970 PyUnicode_WRITE(okind, odata, o++, '\\');
11971 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011972 }
11973 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011974 PyUnicode_WRITE(okind, odata, o++, '\\');
11975 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011976 }
11977
11978 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011979 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011980 PyUnicode_WRITE(okind, odata, o++, '\\');
11981 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011982 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11983 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011984 }
11985
Georg Brandl559e5d72008-06-11 18:37:52 +000011986 /* Copy ASCII characters as-is */
11987 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011988 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011989 }
11990
Benjamin Peterson29060642009-01-31 22:14:21 +000011991 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011992 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011993 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011994 (categories Z* and C* except ASCII space)
11995 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011996 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011997 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011998 if (ch <= 0xff) {
11999 PyUnicode_WRITE(okind, odata, o++, '\\');
12000 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012001 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12002 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012003 }
12004 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012005 else if (ch >= 0x10000) {
12006 PyUnicode_WRITE(okind, odata, o++, '\\');
12007 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012008 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12009 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12010 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12011 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12012 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12013 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12014 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12015 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012016 }
12017 /* Map 16-bit characters to '\uxxxx' */
12018 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012019 PyUnicode_WRITE(okind, odata, o++, '\\');
12020 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012021 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12022 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12023 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12024 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012025 }
12026 }
12027 /* Copy characters as-is */
12028 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012029 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012030 }
12031 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012032 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012033 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012034 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012035 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012036}
12037
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012038PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012039 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012040\n\
12041Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012042such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012043arguments start and end are interpreted as in slice notation.\n\
12044\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012045Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012046
12047static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012048unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012049{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012050 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012051 Py_ssize_t start;
12052 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012053 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012054
Jesus Ceaac451502011-04-20 17:09:23 +020012055 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12056 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012057 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012058
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012059 if (PyUnicode_READY(self) == -1)
12060 return NULL;
12061 if (PyUnicode_READY(substring) == -1)
12062 return NULL;
12063
Victor Stinner7931d9a2011-11-04 00:22:48 +010012064 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012065
12066 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012067
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012068 if (result == -2)
12069 return NULL;
12070
Christian Heimes217cfd12007-12-02 14:31:20 +000012071 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012072}
12073
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012074PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012075 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012076\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012077Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012078
12079static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012080unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012081{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012082 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012083 Py_ssize_t start;
12084 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012085 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012086
Jesus Ceaac451502011-04-20 17:09:23 +020012087 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12088 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012089 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012090
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012091 if (PyUnicode_READY(self) == -1)
12092 return NULL;
12093 if (PyUnicode_READY(substring) == -1)
12094 return NULL;
12095
Victor Stinner7931d9a2011-11-04 00:22:48 +010012096 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012097
12098 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012099
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012100 if (result == -2)
12101 return NULL;
12102
Guido van Rossumd57fd912000-03-10 22:53:23 +000012103 if (result < 0) {
12104 PyErr_SetString(PyExc_ValueError, "substring not found");
12105 return NULL;
12106 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012107
Christian Heimes217cfd12007-12-02 14:31:20 +000012108 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012109}
12110
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012111PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012112 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012113\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012114Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012115done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012116
12117static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012118unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012119{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012120 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012121 Py_UCS4 fillchar = ' ';
12122
Victor Stinnere9a29352011-10-01 02:14:59 +020012123 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012124 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012125
Victor Stinnere9a29352011-10-01 02:14:59 +020012126 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012127 return NULL;
12128
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012129 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012130 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010012131 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012132 }
12133
Victor Stinner7931d9a2011-11-04 00:22:48 +010012134 return pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012135}
12136
Alexander Belopolsky40018472011-02-26 01:02:56 +000012137PyObject *
12138PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012139{
12140 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012141
Guido van Rossumd57fd912000-03-10 22:53:23 +000012142 s = PyUnicode_FromObject(s);
12143 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012144 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012145 if (sep != NULL) {
12146 sep = PyUnicode_FromObject(sep);
12147 if (sep == NULL) {
12148 Py_DECREF(s);
12149 return NULL;
12150 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012151 }
12152
Victor Stinner9310abb2011-10-05 00:59:23 +020012153 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012154
12155 Py_DECREF(s);
12156 Py_XDECREF(sep);
12157 return result;
12158}
12159
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012160PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012161 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012162\n\
12163Return a list of the words in S, using sep as the\n\
12164delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012165splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012166whitespace string is a separator and empty strings are\n\
12167removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012168
12169static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012170unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012171{
12172 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012173 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012174
Martin v. Löwis18e16552006-02-15 17:27:45 +000012175 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012176 return NULL;
12177
12178 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012179 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012180 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012181 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012182 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012183 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012184}
12185
Thomas Wouters477c8d52006-05-27 19:21:47 +000012186PyObject *
12187PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12188{
12189 PyObject* str_obj;
12190 PyObject* sep_obj;
12191 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012192 int kind1, kind2, kind;
12193 void *buf1 = NULL, *buf2 = NULL;
12194 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012195
12196 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020012197 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012198 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012199 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012200 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012201 Py_DECREF(str_obj);
12202 return NULL;
12203 }
12204
Victor Stinner14f8f022011-10-05 20:58:25 +020012205 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012206 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012207 kind = Py_MAX(kind1, kind2);
12208 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012209 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012210 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012211 if (!buf1)
12212 goto onError;
12213 buf2 = PyUnicode_DATA(sep_obj);
12214 if (kind2 != kind)
12215 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12216 if (!buf2)
12217 goto onError;
12218 len1 = PyUnicode_GET_LENGTH(str_obj);
12219 len2 = PyUnicode_GET_LENGTH(sep_obj);
12220
Victor Stinner14f8f022011-10-05 20:58:25 +020012221 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012222 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012223 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12224 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12225 else
12226 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012227 break;
12228 case PyUnicode_2BYTE_KIND:
12229 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12230 break;
12231 case PyUnicode_4BYTE_KIND:
12232 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12233 break;
12234 default:
12235 assert(0);
12236 out = 0;
12237 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012238
12239 Py_DECREF(sep_obj);
12240 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012241 if (kind1 != kind)
12242 PyMem_Free(buf1);
12243 if (kind2 != kind)
12244 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012245
12246 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012247 onError:
12248 Py_DECREF(sep_obj);
12249 Py_DECREF(str_obj);
12250 if (kind1 != kind && buf1)
12251 PyMem_Free(buf1);
12252 if (kind2 != kind && buf2)
12253 PyMem_Free(buf2);
12254 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012255}
12256
12257
12258PyObject *
12259PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12260{
12261 PyObject* str_obj;
12262 PyObject* sep_obj;
12263 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012264 int kind1, kind2, kind;
12265 void *buf1 = NULL, *buf2 = NULL;
12266 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012267
12268 str_obj = PyUnicode_FromObject(str_in);
12269 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012270 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012271 sep_obj = PyUnicode_FromObject(sep_in);
12272 if (!sep_obj) {
12273 Py_DECREF(str_obj);
12274 return NULL;
12275 }
12276
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012277 kind1 = PyUnicode_KIND(str_in);
12278 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012279 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012280 buf1 = PyUnicode_DATA(str_in);
12281 if (kind1 != kind)
12282 buf1 = _PyUnicode_AsKind(str_in, kind);
12283 if (!buf1)
12284 goto onError;
12285 buf2 = PyUnicode_DATA(sep_obj);
12286 if (kind2 != kind)
12287 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12288 if (!buf2)
12289 goto onError;
12290 len1 = PyUnicode_GET_LENGTH(str_obj);
12291 len2 = PyUnicode_GET_LENGTH(sep_obj);
12292
12293 switch(PyUnicode_KIND(str_in)) {
12294 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012295 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12296 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12297 else
12298 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012299 break;
12300 case PyUnicode_2BYTE_KIND:
12301 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12302 break;
12303 case PyUnicode_4BYTE_KIND:
12304 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12305 break;
12306 default:
12307 assert(0);
12308 out = 0;
12309 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012310
12311 Py_DECREF(sep_obj);
12312 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012313 if (kind1 != kind)
12314 PyMem_Free(buf1);
12315 if (kind2 != kind)
12316 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012317
12318 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012319 onError:
12320 Py_DECREF(sep_obj);
12321 Py_DECREF(str_obj);
12322 if (kind1 != kind && buf1)
12323 PyMem_Free(buf1);
12324 if (kind2 != kind && buf2)
12325 PyMem_Free(buf2);
12326 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012327}
12328
12329PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012330 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012331\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012332Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012333the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012334found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012335
12336static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012337unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012338{
Victor Stinner9310abb2011-10-05 00:59:23 +020012339 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012340}
12341
12342PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012343 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012344\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012345Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012346the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012347separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012348
12349static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012350unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012351{
Victor Stinner9310abb2011-10-05 00:59:23 +020012352 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012353}
12354
Alexander Belopolsky40018472011-02-26 01:02:56 +000012355PyObject *
12356PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012357{
12358 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012359
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012360 s = PyUnicode_FromObject(s);
12361 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012362 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012363 if (sep != NULL) {
12364 sep = PyUnicode_FromObject(sep);
12365 if (sep == NULL) {
12366 Py_DECREF(s);
12367 return NULL;
12368 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012369 }
12370
Victor Stinner9310abb2011-10-05 00:59:23 +020012371 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012372
12373 Py_DECREF(s);
12374 Py_XDECREF(sep);
12375 return result;
12376}
12377
12378PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012379 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012380\n\
12381Return a list of the words in S, using sep as the\n\
12382delimiter string, starting at the end of the string and\n\
12383working to the front. If maxsplit is given, at most maxsplit\n\
12384splits are done. If sep is not specified, any whitespace string\n\
12385is a separator.");
12386
12387static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012388unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012389{
12390 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012391 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012392
Martin v. Löwis18e16552006-02-15 17:27:45 +000012393 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012394 return NULL;
12395
12396 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012397 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012398 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012399 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012400 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012401 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012402}
12403
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012404PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012405 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012406\n\
12407Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012408Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012409is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012410
12411static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012412unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012413{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012414 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012415 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012416
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012417 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12418 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012419 return NULL;
12420
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012421 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012422}
12423
12424static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012425PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012426{
Walter Dörwald346737f2007-05-31 10:44:43 +000012427 if (PyUnicode_CheckExact(self)) {
12428 Py_INCREF(self);
12429 return self;
12430 } else
12431 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020012432 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012433}
12434
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012435PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012436 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012437\n\
12438Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012439and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012440
12441static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012442unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012443{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012444 return fixup(self, fixswapcase);
12445}
12446
Georg Brandlceee0772007-11-27 23:48:05 +000012447PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012448 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012449\n\
12450Return a translation table usable for str.translate().\n\
12451If there is only one argument, it must be a dictionary mapping Unicode\n\
12452ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012453Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012454If there are two arguments, they must be strings of equal length, and\n\
12455in the resulting dictionary, each character in x will be mapped to the\n\
12456character at the same position in y. If there is a third argument, it\n\
12457must be a string, whose characters will be mapped to None in the result.");
12458
12459static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012460unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012461{
12462 PyObject *x, *y = NULL, *z = NULL;
12463 PyObject *new = NULL, *key, *value;
12464 Py_ssize_t i = 0;
12465 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012466
Georg Brandlceee0772007-11-27 23:48:05 +000012467 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12468 return NULL;
12469 new = PyDict_New();
12470 if (!new)
12471 return NULL;
12472 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012473 int x_kind, y_kind, z_kind;
12474 void *x_data, *y_data, *z_data;
12475
Georg Brandlceee0772007-11-27 23:48:05 +000012476 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012477 if (!PyUnicode_Check(x)) {
12478 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12479 "be a string if there is a second argument");
12480 goto err;
12481 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012482 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012483 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12484 "arguments must have equal length");
12485 goto err;
12486 }
12487 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012488 x_kind = PyUnicode_KIND(x);
12489 y_kind = PyUnicode_KIND(y);
12490 x_data = PyUnicode_DATA(x);
12491 y_data = PyUnicode_DATA(y);
12492 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12493 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12494 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012495 if (!key || !value)
12496 goto err;
12497 res = PyDict_SetItem(new, key, value);
12498 Py_DECREF(key);
12499 Py_DECREF(value);
12500 if (res < 0)
12501 goto err;
12502 }
12503 /* create entries for deleting chars in z */
12504 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012505 z_kind = PyUnicode_KIND(z);
12506 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012507 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012508 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012509 if (!key)
12510 goto err;
12511 res = PyDict_SetItem(new, key, Py_None);
12512 Py_DECREF(key);
12513 if (res < 0)
12514 goto err;
12515 }
12516 }
12517 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012518 int kind;
12519 void *data;
12520
Georg Brandlceee0772007-11-27 23:48:05 +000012521 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012522 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012523 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12524 "to maketrans it must be a dict");
12525 goto err;
12526 }
12527 /* copy entries into the new dict, converting string keys to int keys */
12528 while (PyDict_Next(x, &i, &key, &value)) {
12529 if (PyUnicode_Check(key)) {
12530 /* convert string keys to integer keys */
12531 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012532 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012533 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12534 "table must be of length 1");
12535 goto err;
12536 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012537 kind = PyUnicode_KIND(key);
12538 data = PyUnicode_DATA(key);
12539 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012540 if (!newkey)
12541 goto err;
12542 res = PyDict_SetItem(new, newkey, value);
12543 Py_DECREF(newkey);
12544 if (res < 0)
12545 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012546 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012547 /* just keep integer keys */
12548 if (PyDict_SetItem(new, key, value) < 0)
12549 goto err;
12550 } else {
12551 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12552 "be strings or integers");
12553 goto err;
12554 }
12555 }
12556 }
12557 return new;
12558 err:
12559 Py_DECREF(new);
12560 return NULL;
12561}
12562
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012563PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012564 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012565\n\
12566Return a copy of the string S, where all characters have been mapped\n\
12567through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012568Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012569Unmapped characters are left untouched. Characters mapped to None\n\
12570are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012571
12572static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012573unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012574{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012575 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012576}
12577
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012578PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012579 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012580\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012581Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012582
12583static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012584unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012585{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012586 return fixup(self, fixupper);
12587}
12588
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012589PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012590 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012591\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012592Pad a numeric string S with zeros on the left, to fill a field\n\
12593of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012594
12595static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012596unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012597{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012598 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012599 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012600 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012601 int kind;
12602 void *data;
12603 Py_UCS4 chr;
12604
12605 if (PyUnicode_READY(self) == -1)
12606 return NULL;
12607
Martin v. Löwis18e16552006-02-15 17:27:45 +000012608 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012609 return NULL;
12610
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012611 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012612 if (PyUnicode_CheckExact(self)) {
12613 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010012614 return self;
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012615 }
12616 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012617 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012618 }
12619
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012620 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012621
12622 u = pad(self, fill, 0, '0');
12623
Walter Dörwald068325e2002-04-15 13:36:47 +000012624 if (u == NULL)
12625 return NULL;
12626
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012627 kind = PyUnicode_KIND(u);
12628 data = PyUnicode_DATA(u);
12629 chr = PyUnicode_READ(kind, data, fill);
12630
12631 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012632 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012633 PyUnicode_WRITE(kind, data, 0, chr);
12634 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012635 }
12636
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012637 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012638 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012639}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012640
#if 0
/* Compiled out.  Thin wrapper around
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
12648
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012649PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012650 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012651\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012652Return True if S starts with the specified prefix, False otherwise.\n\
12653With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012654With optional end, stop comparing S at that position.\n\
12655prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012656
12657static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012658unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012659 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012660{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012661 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012662 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012663 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012664 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012665 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012666
Jesus Ceaac451502011-04-20 17:09:23 +020012667 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012668 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012669 if (PyTuple_Check(subobj)) {
12670 Py_ssize_t i;
12671 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012672 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012673 if (substring == NULL)
12674 return NULL;
12675 result = tailmatch(self, substring, start, end, -1);
12676 Py_DECREF(substring);
12677 if (result) {
12678 Py_RETURN_TRUE;
12679 }
12680 }
12681 /* nothing matched */
12682 Py_RETURN_FALSE;
12683 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012684 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012685 if (substring == NULL) {
12686 if (PyErr_ExceptionMatches(PyExc_TypeError))
12687 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12688 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012689 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012690 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012691 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012692 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012693 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012694}
12695
12696
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012697PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012698 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012699\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012700Return True if S ends with the specified suffix, False otherwise.\n\
12701With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012702With optional end, stop comparing S at that position.\n\
12703suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012704
12705static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012706unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012707 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012708{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012709 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012710 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012711 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012712 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012713 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012714
Jesus Ceaac451502011-04-20 17:09:23 +020012715 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012716 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012717 if (PyTuple_Check(subobj)) {
12718 Py_ssize_t i;
12719 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012720 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012721 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012722 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012723 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012724 result = tailmatch(self, substring, start, end, +1);
12725 Py_DECREF(substring);
12726 if (result) {
12727 Py_RETURN_TRUE;
12728 }
12729 }
12730 Py_RETURN_FALSE;
12731 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012732 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012733 if (substring == NULL) {
12734 if (PyErr_ExceptionMatches(PyExc_TypeError))
12735 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12736 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012737 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012738 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012739 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012740 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012741 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012742}
12743
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012744#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012745
12746PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012747 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012748\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012749Return a formatted version of S, using substitutions from args and kwargs.\n\
12750The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012751
Eric Smith27bbca62010-11-04 17:06:58 +000012752PyDoc_STRVAR(format_map__doc__,
12753 "S.format_map(mapping) -> str\n\
12754\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012755Return a formatted version of S, using substitutions from mapping.\n\
12756The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012757
Eric Smith4a7d76d2008-05-30 18:10:19 +000012758static PyObject *
12759unicode__format__(PyObject* self, PyObject* args)
12760{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012761 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012762
12763 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12764 return NULL;
12765
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012766 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012767 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012768 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012769}
12770
Eric Smith8c663262007-08-25 02:26:07 +000012771PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012772 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012773\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012774Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012775
12776static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012777unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012778{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012779 Py_ssize_t size;
12780
12781 /* If it's a compact object, account for base structure +
12782 character data. */
12783 if (PyUnicode_IS_COMPACT_ASCII(v))
12784 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12785 else if (PyUnicode_IS_COMPACT(v))
12786 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012787 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012788 else {
12789 /* If it is a two-block object, account for base object, and
12790 for character block if present. */
12791 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012792 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012793 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012794 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012795 }
12796 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012797 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012798 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012799 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012800 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012801 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012802
12803 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012804}
12805
12806PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012807 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012808
12809static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012810unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012811{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012812 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012813 if (!copy)
12814 return NULL;
12815 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012816}
12817
Guido van Rossumd57fd912000-03-10 22:53:23 +000012818static PyMethodDef unicode_methods[] = {
12819
12820 /* Order is according to common usage: often used methods should
12821 appear first, since lookup is done sequentially. */
12822
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012823 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012824 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12825 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012826 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012827 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12828 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12829 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12830 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12831 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12832 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12833 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012834 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012835 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12836 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12837 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012838 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012839 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12840 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12841 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012842 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012843 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012844 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012845 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012846 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12847 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12848 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12849 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12850 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12851 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12852 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12853 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12854 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12855 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12856 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12857 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12858 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12859 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012860 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012861 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012862 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012863 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012864 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012865 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012866 {"maketrans", (PyCFunction) unicode_maketrans,
12867 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012868 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012869#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012870 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012871#endif
12872
12873#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012874 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012875 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012876#endif
12877
Benjamin Peterson14339b62009-01-31 16:36:08 +000012878 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012879 {NULL, NULL}
12880};
12881
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012882static PyObject *
12883unicode_mod(PyObject *v, PyObject *w)
12884{
Brian Curtindfc80e32011-08-10 20:28:54 -050012885 if (!PyUnicode_Check(v))
12886 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012887 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012888}
12889
12890static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012891 0, /*nb_add*/
12892 0, /*nb_subtract*/
12893 0, /*nb_multiply*/
12894 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012895};
12896
Guido van Rossumd57fd912000-03-10 22:53:23 +000012897static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012898 (lenfunc) unicode_length, /* sq_length */
12899 PyUnicode_Concat, /* sq_concat */
12900 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12901 (ssizeargfunc) unicode_getitem, /* sq_item */
12902 0, /* sq_slice */
12903 0, /* sq_ass_item */
12904 0, /* sq_ass_slice */
12905 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012906};
12907
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012908static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012909unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012910{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012911 if (PyUnicode_READY(self) == -1)
12912 return NULL;
12913
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012914 if (PyIndex_Check(item)) {
12915 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012916 if (i == -1 && PyErr_Occurred())
12917 return NULL;
12918 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012919 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012920 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012921 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012922 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012923 PyObject *result;
12924 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012925 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012926 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012927
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012928 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012929 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012930 return NULL;
12931 }
12932
12933 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012934 return PyUnicode_New(0, 0);
12935 } else if (start == 0 && step == 1 &&
12936 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012937 PyUnicode_CheckExact(self)) {
12938 Py_INCREF(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012939 return self;
Thomas Woutersed03b412007-08-28 21:37:11 +000012940 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012941 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020012942 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012943 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012944 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012945 src_kind = PyUnicode_KIND(self);
12946 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020012947 if (!PyUnicode_IS_ASCII(self)) {
12948 kind_limit = kind_maxchar_limit(src_kind);
12949 max_char = 0;
12950 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12951 ch = PyUnicode_READ(src_kind, src_data, cur);
12952 if (ch > max_char) {
12953 max_char = ch;
12954 if (max_char >= kind_limit)
12955 break;
12956 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012957 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012958 }
Victor Stinner55c99112011-10-13 01:17:06 +020012959 else
12960 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012961 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012962 if (result == NULL)
12963 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012964 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012965 dest_data = PyUnicode_DATA(result);
12966
12967 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012968 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
12969 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012970 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012971 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012972 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012973 } else {
12974 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12975 return NULL;
12976 }
12977}
12978
12979static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012980 (lenfunc)unicode_length, /* mp_length */
12981 (binaryfunc)unicode_subscript, /* mp_subscript */
12982 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012983};
12984
Guido van Rossumd57fd912000-03-10 22:53:23 +000012985
Guido van Rossumd57fd912000-03-10 22:53:23 +000012986/* Helpers for PyUnicode_Format() */
12987
12988static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012989getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012990{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012991 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012992 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012993 (*p_argidx)++;
12994 if (arglen < 0)
12995 return args;
12996 else
12997 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012998 }
12999 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013000 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013001 return NULL;
13002}
13003
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013004/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013005
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013006static PyObject *
13007formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013008{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013009 char *p;
13010 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013011 double x;
Tim Petersced69f82003-09-16 20:30:58 +000013012
Guido van Rossumd57fd912000-03-10 22:53:23 +000013013 x = PyFloat_AsDouble(v);
13014 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013015 return NULL;
13016
Guido van Rossumd57fd912000-03-10 22:53:23 +000013017 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013018 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013019
Eric Smith0923d1d2009-04-16 20:16:10 +000013020 p = PyOS_double_to_string(x, type, prec,
13021 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013022 if (p == NULL)
13023 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013024 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000013025 PyMem_Free(p);
13026 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013027}
13028
Tim Peters38fd5b62000-09-21 05:43:11 +000013029static PyObject*
13030formatlong(PyObject *val, int flags, int prec, int type)
13031{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013032 char *buf;
13033 int len;
13034 PyObject *str; /* temporary string object. */
13035 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013036
Benjamin Peterson14339b62009-01-31 16:36:08 +000013037 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
13038 if (!str)
13039 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013040 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013041 Py_DECREF(str);
13042 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013043}
13044
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013045static Py_UCS4
13046formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013047{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013048 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013049 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013050 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013051 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013052 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013053 goto onError;
13054 }
13055 else {
13056 /* Integer input truncated to a character */
13057 long x;
13058 x = PyLong_AsLong(v);
13059 if (x == -1 && PyErr_Occurred())
13060 goto onError;
13061
13062 if (x < 0 || x > 0x10ffff) {
13063 PyErr_SetString(PyExc_OverflowError,
13064 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013065 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013066 }
13067
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013068 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013069 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013070
Benjamin Peterson29060642009-01-31 22:14:21 +000013071 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013072 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013073 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013074 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013075}
13076
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013077static int
13078repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13079{
13080 int r;
13081 assert(count > 0);
13082 assert(PyUnicode_Check(obj));
13083 if (count > 5) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013084 PyObject *repeated = unicode_repeat(obj, count);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013085 if (repeated == NULL)
13086 return -1;
13087 r = _PyAccu_Accumulate(acc, repeated);
13088 Py_DECREF(repeated);
13089 return r;
13090 }
13091 else {
13092 do {
13093 if (_PyAccu_Accumulate(acc, obj))
13094 return -1;
13095 } while (--count);
13096 return 0;
13097 }
13098}
13099
Alexander Belopolsky40018472011-02-26 01:02:56 +000013100PyObject *
13101PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013102{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013103 void *fmt;
13104 int fmtkind;
13105 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013106 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013107 int r;
13108 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013109 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013110 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013111 PyObject *temp = NULL;
13112 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013113 PyObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013114 _PyAccu acc;
13115 static PyObject *plus, *minus, *blank, *zero, *percent;
13116
13117 if (!plus && !(plus = get_latin1_char('+')))
13118 return NULL;
13119 if (!minus && !(minus = get_latin1_char('-')))
13120 return NULL;
13121 if (!blank && !(blank = get_latin1_char(' ')))
13122 return NULL;
13123 if (!zero && !(zero = get_latin1_char('0')))
13124 return NULL;
13125 if (!percent && !(percent = get_latin1_char('%')))
13126 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013127
Guido van Rossumd57fd912000-03-10 22:53:23 +000013128 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013129 PyErr_BadInternalCall();
13130 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013131 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013132 uformat = PyUnicode_FromObject(format);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013133 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013134 return NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013135 if (_PyAccu_Init(&acc))
13136 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013137 fmt = PyUnicode_DATA(uformat);
13138 fmtkind = PyUnicode_KIND(uformat);
13139 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13140 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013141
Guido van Rossumd57fd912000-03-10 22:53:23 +000013142 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013143 arglen = PyTuple_Size(args);
13144 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013145 }
13146 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013147 arglen = -1;
13148 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013149 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013150 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013151 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013152 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013153
13154 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013155 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013156 PyObject *nonfmt;
13157 Py_ssize_t nonfmtpos;
13158 nonfmtpos = fmtpos++;
13159 while (fmtcnt >= 0 &&
13160 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13161 fmtpos++;
13162 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013163 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013164 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013165 if (nonfmt == NULL)
13166 goto onError;
13167 r = _PyAccu_Accumulate(&acc, nonfmt);
13168 Py_DECREF(nonfmt);
13169 if (r)
13170 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013171 }
13172 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013173 /* Got a format specifier */
13174 int flags = 0;
13175 Py_ssize_t width = -1;
13176 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013177 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013178 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013179 int isnumok;
13180 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013181 void *pbuf = NULL;
13182 Py_ssize_t pindex, len;
13183 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013184
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013185 fmtpos++;
13186 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13187 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013188 Py_ssize_t keylen;
13189 PyObject *key;
13190 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013191
Benjamin Peterson29060642009-01-31 22:14:21 +000013192 if (dict == NULL) {
13193 PyErr_SetString(PyExc_TypeError,
13194 "format requires a mapping");
13195 goto onError;
13196 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013197 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013198 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013199 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013200 /* Skip over balanced parentheses */
13201 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013202 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013203 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013204 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013205 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013206 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013207 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013208 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013209 if (fmtcnt < 0 || pcount > 0) {
13210 PyErr_SetString(PyExc_ValueError,
13211 "incomplete format key");
13212 goto onError;
13213 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013214 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013215 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013216 if (key == NULL)
13217 goto onError;
13218 if (args_owned) {
13219 Py_DECREF(args);
13220 args_owned = 0;
13221 }
13222 args = PyObject_GetItem(dict, key);
13223 Py_DECREF(key);
13224 if (args == NULL) {
13225 goto onError;
13226 }
13227 args_owned = 1;
13228 arglen = -1;
13229 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013230 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013231 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013232 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013233 case '-': flags |= F_LJUST; continue;
13234 case '+': flags |= F_SIGN; continue;
13235 case ' ': flags |= F_BLANK; continue;
13236 case '#': flags |= F_ALT; continue;
13237 case '0': flags |= F_ZERO; continue;
13238 }
13239 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013240 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013241 if (c == '*') {
13242 v = getnextarg(args, arglen, &argidx);
13243 if (v == NULL)
13244 goto onError;
13245 if (!PyLong_Check(v)) {
13246 PyErr_SetString(PyExc_TypeError,
13247 "* wants int");
13248 goto onError;
13249 }
13250 width = PyLong_AsLong(v);
13251 if (width == -1 && PyErr_Occurred())
13252 goto onError;
13253 if (width < 0) {
13254 flags |= F_LJUST;
13255 width = -width;
13256 }
13257 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013258 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013259 }
13260 else if (c >= '0' && c <= '9') {
13261 width = c - '0';
13262 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013263 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013264 if (c < '0' || c > '9')
13265 break;
13266 if ((width*10) / 10 != width) {
13267 PyErr_SetString(PyExc_ValueError,
13268 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013269 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013270 }
13271 width = width*10 + (c - '0');
13272 }
13273 }
13274 if (c == '.') {
13275 prec = 0;
13276 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013277 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013278 if (c == '*') {
13279 v = getnextarg(args, arglen, &argidx);
13280 if (v == NULL)
13281 goto onError;
13282 if (!PyLong_Check(v)) {
13283 PyErr_SetString(PyExc_TypeError,
13284 "* wants int");
13285 goto onError;
13286 }
13287 prec = PyLong_AsLong(v);
13288 if (prec == -1 && PyErr_Occurred())
13289 goto onError;
13290 if (prec < 0)
13291 prec = 0;
13292 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013293 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013294 }
13295 else if (c >= '0' && c <= '9') {
13296 prec = c - '0';
13297 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013298 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013299 if (c < '0' || c > '9')
13300 break;
13301 if ((prec*10) / 10 != prec) {
13302 PyErr_SetString(PyExc_ValueError,
13303 "prec too big");
13304 goto onError;
13305 }
13306 prec = prec*10 + (c - '0');
13307 }
13308 }
13309 } /* prec */
13310 if (fmtcnt >= 0) {
13311 if (c == 'h' || c == 'l' || c == 'L') {
13312 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013313 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013314 }
13315 }
13316 if (fmtcnt < 0) {
13317 PyErr_SetString(PyExc_ValueError,
13318 "incomplete format");
13319 goto onError;
13320 }
13321 if (c != '%') {
13322 v = getnextarg(args, arglen, &argidx);
13323 if (v == NULL)
13324 goto onError;
13325 }
13326 sign = 0;
13327 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013328 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013329 switch (c) {
13330
13331 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013332 _PyAccu_Accumulate(&acc, percent);
13333 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013334
13335 case 's':
13336 case 'r':
13337 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013338 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013339 temp = v;
13340 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013341 }
13342 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013343 if (c == 's')
13344 temp = PyObject_Str(v);
13345 else if (c == 'r')
13346 temp = PyObject_Repr(v);
13347 else
13348 temp = PyObject_ASCII(v);
13349 if (temp == NULL)
13350 goto onError;
13351 if (PyUnicode_Check(temp))
13352 /* nothing to do */;
13353 else {
13354 Py_DECREF(temp);
13355 PyErr_SetString(PyExc_TypeError,
13356 "%s argument has non-string str()");
13357 goto onError;
13358 }
13359 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013360 if (PyUnicode_READY(temp) == -1) {
13361 Py_CLEAR(temp);
13362 goto onError;
13363 }
13364 pbuf = PyUnicode_DATA(temp);
13365 kind = PyUnicode_KIND(temp);
13366 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013367 if (prec >= 0 && len > prec)
13368 len = prec;
13369 break;
13370
13371 case 'i':
13372 case 'd':
13373 case 'u':
13374 case 'o':
13375 case 'x':
13376 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013377 isnumok = 0;
13378 if (PyNumber_Check(v)) {
13379 PyObject *iobj=NULL;
13380
13381 if (PyLong_Check(v)) {
13382 iobj = v;
13383 Py_INCREF(iobj);
13384 }
13385 else {
13386 iobj = PyNumber_Long(v);
13387 }
13388 if (iobj!=NULL) {
13389 if (PyLong_Check(iobj)) {
13390 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013391 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013392 Py_DECREF(iobj);
13393 if (!temp)
13394 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013395 if (PyUnicode_READY(temp) == -1) {
13396 Py_CLEAR(temp);
13397 goto onError;
13398 }
13399 pbuf = PyUnicode_DATA(temp);
13400 kind = PyUnicode_KIND(temp);
13401 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013402 sign = 1;
13403 }
13404 else {
13405 Py_DECREF(iobj);
13406 }
13407 }
13408 }
13409 if (!isnumok) {
13410 PyErr_Format(PyExc_TypeError,
13411 "%%%c format: a number is required, "
13412 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13413 goto onError;
13414 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013415 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013416 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013417 fillobj = zero;
13418 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013419 break;
13420
13421 case 'e':
13422 case 'E':
13423 case 'f':
13424 case 'F':
13425 case 'g':
13426 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013427 temp = formatfloat(v, flags, prec, c);
13428 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013429 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013430 if (PyUnicode_READY(temp) == -1) {
13431 Py_CLEAR(temp);
13432 goto onError;
13433 }
13434 pbuf = PyUnicode_DATA(temp);
13435 kind = PyUnicode_KIND(temp);
13436 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013437 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013438 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013439 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013440 fillobj = zero;
13441 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013442 break;
13443
13444 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013445 {
13446 Py_UCS4 ch = formatchar(v);
13447 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013448 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013449 temp = _PyUnicode_FromUCS4(&ch, 1);
13450 if (temp == NULL)
13451 goto onError;
13452 pbuf = PyUnicode_DATA(temp);
13453 kind = PyUnicode_KIND(temp);
13454 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013455 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013456 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013457
13458 default:
13459 PyErr_Format(PyExc_ValueError,
13460 "unsupported format character '%c' (0x%x) "
13461 "at index %zd",
13462 (31<=c && c<=126) ? (char)c : '?',
13463 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013464 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013465 goto onError;
13466 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013467 /* pbuf is initialized here. */
13468 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013469 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013470 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13471 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013472 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013473 pindex++;
13474 }
13475 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13476 signobj = plus;
13477 len--;
13478 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013479 }
13480 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013481 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013482 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013483 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013484 else
13485 sign = 0;
13486 }
13487 if (width < len)
13488 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013489 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013490 if (fill != ' ') {
13491 assert(signobj != NULL);
13492 if (_PyAccu_Accumulate(&acc, signobj))
13493 goto onError;
13494 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013495 if (width > len)
13496 width--;
13497 }
13498 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013499 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013500 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013501 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013502 second = get_latin1_char(
13503 PyUnicode_READ(kind, pbuf, pindex + 1));
13504 pindex += 2;
13505 if (second == NULL ||
13506 _PyAccu_Accumulate(&acc, zero) ||
13507 _PyAccu_Accumulate(&acc, second))
13508 goto onError;
13509 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013510 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013511 width -= 2;
13512 if (width < 0)
13513 width = 0;
13514 len -= 2;
13515 }
13516 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013517 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013518 if (repeat_accumulate(&acc, fillobj, width - len))
13519 goto onError;
13520 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013521 }
13522 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013523 if (sign) {
13524 assert(signobj != NULL);
13525 if (_PyAccu_Accumulate(&acc, signobj))
13526 goto onError;
13527 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013528 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013529 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13530 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013531 second = get_latin1_char(
13532 PyUnicode_READ(kind, pbuf, pindex + 1));
13533 pindex += 2;
13534 if (second == NULL ||
13535 _PyAccu_Accumulate(&acc, zero) ||
13536 _PyAccu_Accumulate(&acc, second))
13537 goto onError;
13538 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013539 }
13540 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013541 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013542 if (temp != NULL) {
13543 assert(pbuf == PyUnicode_DATA(temp));
13544 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013545 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013546 else {
13547 const char *p = (const char *) pbuf;
13548 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013549 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013550 v = PyUnicode_FromKindAndData(kind, p, len);
13551 }
13552 if (v == NULL)
13553 goto onError;
13554 r = _PyAccu_Accumulate(&acc, v);
13555 Py_DECREF(v);
13556 if (r)
13557 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013558 if (width > len && repeat_accumulate(&acc, blank, width - len))
13559 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013560 if (dict && (argidx < arglen) && c != '%') {
13561 PyErr_SetString(PyExc_TypeError,
13562 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013563 goto onError;
13564 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013565 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013566 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013567 } /* until end */
13568 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013569 PyErr_SetString(PyExc_TypeError,
13570 "not all arguments converted during string formatting");
13571 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013572 }
13573
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013574 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013575 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013576 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013577 }
13578 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013579 Py_XDECREF(temp);
13580 Py_XDECREF(second);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013581 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013582
Benjamin Peterson29060642009-01-31 22:14:21 +000013583 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013584 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013585 Py_XDECREF(temp);
13586 Py_XDECREF(second);
13587 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013588 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013589 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013590 }
13591 return NULL;
13592}
13593
Jeremy Hylton938ace62002-07-17 16:30:39 +000013594static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013595unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13596
Tim Peters6d6c1a32001-08-02 04:15:00 +000013597static PyObject *
13598unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13599{
Benjamin Peterson29060642009-01-31 22:14:21 +000013600 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013601 static char *kwlist[] = {"object", "encoding", "errors", 0};
13602 char *encoding = NULL;
13603 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013604
Benjamin Peterson14339b62009-01-31 16:36:08 +000013605 if (type != &PyUnicode_Type)
13606 return unicode_subtype_new(type, args, kwds);
13607 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013608 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013609 return NULL;
13610 if (x == NULL)
Victor Stinner7931d9a2011-11-04 00:22:48 +010013611 return PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013612 if (encoding == NULL && errors == NULL)
13613 return PyObject_Str(x);
13614 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013615 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013616}
13617
Guido van Rossume023fe02001-08-30 03:12:59 +000013618static PyObject *
13619unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13620{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013621 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013622 Py_ssize_t length, char_size;
13623 int share_wstr, share_utf8;
13624 unsigned int kind;
13625 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013626
Benjamin Peterson14339b62009-01-31 16:36:08 +000013627 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013628
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013629 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013630 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013631 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013632 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013633 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013634 return NULL;
13635
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013636 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013637 if (self == NULL) {
13638 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013639 return NULL;
13640 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013641 kind = PyUnicode_KIND(unicode);
13642 length = PyUnicode_GET_LENGTH(unicode);
13643
13644 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013645#ifdef Py_DEBUG
13646 _PyUnicode_HASH(self) = -1;
13647#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013648 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013649#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013650 _PyUnicode_STATE(self).interned = 0;
13651 _PyUnicode_STATE(self).kind = kind;
13652 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013653 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013654 _PyUnicode_STATE(self).ready = 1;
13655 _PyUnicode_WSTR(self) = NULL;
13656 _PyUnicode_UTF8_LENGTH(self) = 0;
13657 _PyUnicode_UTF8(self) = NULL;
13658 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013659 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013660
13661 share_utf8 = 0;
13662 share_wstr = 0;
13663 if (kind == PyUnicode_1BYTE_KIND) {
13664 char_size = 1;
13665 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13666 share_utf8 = 1;
13667 }
13668 else if (kind == PyUnicode_2BYTE_KIND) {
13669 char_size = 2;
13670 if (sizeof(wchar_t) == 2)
13671 share_wstr = 1;
13672 }
13673 else {
13674 assert(kind == PyUnicode_4BYTE_KIND);
13675 char_size = 4;
13676 if (sizeof(wchar_t) == 4)
13677 share_wstr = 1;
13678 }
13679
13680 /* Ensure we won't overflow the length. */
13681 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13682 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013683 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013684 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013685 data = PyObject_MALLOC((length + 1) * char_size);
13686 if (data == NULL) {
13687 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013688 goto onError;
13689 }
13690
Victor Stinnerc3c74152011-10-02 20:39:55 +020013691 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013692 if (share_utf8) {
13693 _PyUnicode_UTF8_LENGTH(self) = length;
13694 _PyUnicode_UTF8(self) = data;
13695 }
13696 if (share_wstr) {
13697 _PyUnicode_WSTR_LENGTH(self) = length;
13698 _PyUnicode_WSTR(self) = (wchar_t *)data;
13699 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013700
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013701 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013702 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013703 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013704#ifdef Py_DEBUG
13705 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13706#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020013707 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013708 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013709
13710onError:
13711 Py_DECREF(unicode);
13712 Py_DECREF(self);
13713 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013714}
13715
/* Docstring of the str type; installed as the tp_doc slot of
   PyUnicode_Type.  The continuation lines must start in column 0,
   because any leading whitespace would become part of the string. */
PyDoc_STRVAR(unicode_doc,
             "str(string[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013722
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013723static PyObject *unicode_iter(PyObject *seq);
13724
/* Type object for str.  Slots are listed positionally, so their order
   must follow the PyTypeObject layout; a 0 entry leaves the slot unset. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                        /* tp_name */
    sizeof(PyUnicodeObject),      /* tp_basicsize */
    0,                            /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,  /* tp_dealloc */
    0,                            /* tp_print */
    0,                            /* tp_getattr */
    0,                            /* tp_setattr */
    0,                            /* tp_reserved */
    unicode_repr,                 /* tp_repr */
    &unicode_as_number,           /* tp_as_number */
    &unicode_as_sequence,         /* tp_as_sequence */
    &unicode_as_mapping,          /* tp_as_mapping */
    (hashfunc) unicode_hash,      /* tp_hash*/
    0,                            /* tp_call*/
    (reprfunc) unicode_str,       /* tp_str */
    PyObject_GenericGetAttr,      /* tp_getattro */
    0,                            /* tp_setattro */
    0,                            /* tp_as_buffer */
    /* Subclassable, and flagged as a str (sub)class. */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
    unicode_doc,                  /* tp_doc */
    0,                            /* tp_traverse */
    0,                            /* tp_clear */
    PyUnicode_RichCompare,        /* tp_richcompare */
    0,                            /* tp_weaklistoffset */
    unicode_iter,                 /* tp_iter */
    0,                            /* tp_iternext */
    unicode_methods,              /* tp_methods */
    0,                            /* tp_members */
    0,                            /* tp_getset */
    &PyBaseObject_Type,           /* tp_base */
    0,                            /* tp_dict */
    0,                            /* tp_descr_get */
    0,                            /* tp_descr_set */
    0,                            /* tp_dictoffset */
    0,                            /* tp_init */
    0,                            /* tp_alloc */
    unicode_new,                  /* tp_new */
    PyObject_Del,                 /* tp_free */
};
13768
13769/* Initialize the Unicode implementation */
13770
Victor Stinner3a50e702011-10-18 21:21:00 +020013771int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013772{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013773 int i;
13774
Thomas Wouters477c8d52006-05-27 19:21:47 +000013775 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013776 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013777 0x000A, /* LINE FEED */
13778 0x000D, /* CARRIAGE RETURN */
13779 0x001C, /* FILE SEPARATOR */
13780 0x001D, /* GROUP SEPARATOR */
13781 0x001E, /* RECORD SEPARATOR */
13782 0x0085, /* NEXT LINE */
13783 0x2028, /* LINE SEPARATOR */
13784 0x2029, /* PARAGRAPH SEPARATOR */
13785 };
13786
Fred Drakee4315f52000-05-09 19:53:39 +000013787 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013788 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013789 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013790 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010013791 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013792
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013793 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013794 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013795 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013796 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013797
13798 /* initialize the linebreak bloom filter */
13799 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013800 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013801 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013802
13803 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020013804
13805#ifdef HAVE_MBCS
13806 winver.dwOSVersionInfoSize = sizeof(winver);
13807 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
13808 PyErr_SetFromWindowsErr(0);
13809 return -1;
13810 }
13811#endif
13812 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013813}
13814
13815/* Finalize the Unicode implementation */
13816
/* Free-list clearing entry point.  This implementation releases nothing
   and always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
13822
Guido van Rossumd57fd912000-03-10 22:53:23 +000013823void
Thomas Wouters78890102000-07-22 19:25:51 +000013824_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013825{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013826 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013827
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013828 Py_XDECREF(unicode_empty);
13829 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013830
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013831 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013832 if (unicode_latin1[i]) {
13833 Py_DECREF(unicode_latin1[i]);
13834 unicode_latin1[i] = NULL;
13835 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013836 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020013837 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000013838 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013839}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013840
Walter Dörwald16807132007-05-25 13:52:07 +000013841void
13842PyUnicode_InternInPlace(PyObject **p)
13843{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013844 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013845 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013846#ifdef Py_DEBUG
13847 assert(s != NULL);
13848 assert(_PyUnicode_CHECK(s));
13849#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013850 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013851 return;
13852#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013853 /* If it's a subclass, we don't really know what putting
13854 it in the interned dict might do. */
13855 if (!PyUnicode_CheckExact(s))
13856 return;
13857 if (PyUnicode_CHECK_INTERNED(s))
13858 return;
13859 if (interned == NULL) {
13860 interned = PyDict_New();
13861 if (interned == NULL) {
13862 PyErr_Clear(); /* Don't leave an exception */
13863 return;
13864 }
13865 }
13866 /* It might be that the GetItem call fails even
13867 though the key is present in the dictionary,
13868 namely when this happens during a stack overflow. */
13869 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010013870 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013871 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013872
Benjamin Peterson29060642009-01-31 22:14:21 +000013873 if (t) {
13874 Py_INCREF(t);
13875 Py_DECREF(*p);
13876 *p = t;
13877 return;
13878 }
Walter Dörwald16807132007-05-25 13:52:07 +000013879
Benjamin Peterson14339b62009-01-31 16:36:08 +000013880 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010013881 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013882 PyErr_Clear();
13883 PyThreadState_GET()->recursion_critical = 0;
13884 return;
13885 }
13886 PyThreadState_GET()->recursion_critical = 0;
13887 /* The two references in interned are not counted by refcnt.
13888 The deallocator will take care of this */
13889 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013890 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013891}
13892
13893void
13894PyUnicode_InternImmortal(PyObject **p)
13895{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013896 PyUnicode_InternInPlace(p);
13897 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020013898 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013899 Py_INCREF(*p);
13900 }
Walter Dörwald16807132007-05-25 13:52:07 +000013901}
13902
13903PyObject *
13904PyUnicode_InternFromString(const char *cp)
13905{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013906 PyObject *s = PyUnicode_FromString(cp);
13907 if (s == NULL)
13908 return NULL;
13909 PyUnicode_InternInPlace(&s);
13910 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013911}
13912
Alexander Belopolsky40018472011-02-26 01:02:56 +000013913void
13914_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013915{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013916 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013917 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013918 Py_ssize_t i, n;
13919 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013920
Benjamin Peterson14339b62009-01-31 16:36:08 +000013921 if (interned == NULL || !PyDict_Check(interned))
13922 return;
13923 keys = PyDict_Keys(interned);
13924 if (keys == NULL || !PyList_Check(keys)) {
13925 PyErr_Clear();
13926 return;
13927 }
Walter Dörwald16807132007-05-25 13:52:07 +000013928
Benjamin Peterson14339b62009-01-31 16:36:08 +000013929 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13930 detector, interned unicode strings are not forcibly deallocated;
13931 rather, we give them their stolen references back, and then clear
13932 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013933
Benjamin Peterson14339b62009-01-31 16:36:08 +000013934 n = PyList_GET_SIZE(keys);
13935 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013936 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013937 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013938 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013939 if (PyUnicode_READY(s) == -1) {
13940 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013941 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013942 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013943 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013944 case SSTATE_NOT_INTERNED:
13945 /* XXX Shouldn't happen */
13946 break;
13947 case SSTATE_INTERNED_IMMORTAL:
13948 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013949 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013950 break;
13951 case SSTATE_INTERNED_MORTAL:
13952 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013953 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013954 break;
13955 default:
13956 Py_FatalError("Inconsistent interned string state.");
13957 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013958 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013959 }
13960 fprintf(stderr, "total size of all interned strings: "
13961 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13962 "mortal/immortal\n", mortal_size, immortal_size);
13963 Py_DECREF(keys);
13964 PyDict_Clear(interned);
13965 Py_DECREF(interned);
13966 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013967}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013968
13969
13970/********************* Unicode Iterator **************************/
13971
/* State of a str iterator: the string being walked and the position of
   the next character to produce. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;    /* index of the next character to return */
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
13977
/* tp_dealloc of the str iterator: stop GC tracking, drop the reference
   to the string (if any is still held) and free the iterator. */
static void
unicodeiter_dealloc(unicodeiterobject *it)
{
    _PyObject_GC_UNTRACK(it);
    /* it_seq is NULL once the iterator has been exhausted. */
    Py_XDECREF(it->it_seq);
    PyObject_GC_Del(it);
}
13985
/* tp_traverse of the str iterator: the only object reference it holds is
   the string being iterated. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
13992
13993static PyObject *
13994unicodeiter_next(unicodeiterobject *it)
13995{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013996 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013997
Benjamin Peterson14339b62009-01-31 16:36:08 +000013998 assert(it != NULL);
13999 seq = it->it_seq;
14000 if (seq == NULL)
14001 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014002 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014003
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014004 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14005 int kind = PyUnicode_KIND(seq);
14006 void *data = PyUnicode_DATA(seq);
14007 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14008 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014009 if (item != NULL)
14010 ++it->it_index;
14011 return item;
14012 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014013
Benjamin Peterson14339b62009-01-31 16:36:08 +000014014 Py_DECREF(seq);
14015 it->it_seq = NULL;
14016 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014017}
14018
14019static PyObject *
14020unicodeiter_len(unicodeiterobject *it)
14021{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014022 Py_ssize_t len = 0;
14023 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014024 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014025 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014026}
14027
/* Docstring for the iterator's __length_hint__ method. */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table of the str iterator; referenced from the tp_methods slot
   of PyUnicodeIter_Type. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
14035
/* Type object of the str iterator ("str_iterator").  It takes part in
   garbage collection (Py_TPFLAGS_HAVE_GC with a tp_traverse slot) and is
   its own iterator (tp_iter is PyObject_SelfIter).  The initializers are
   positional; the slots after the last one listed are left to the
   default zero initialization. */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",         /* tp_name */
    sizeof(unicodeiterobject),      /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    0,                  /* tp_repr */
    0,                  /* tp_as_number */
    0,                  /* tp_as_sequence */
    0,                  /* tp_as_mapping */
    0,                  /* tp_hash */
    0,                  /* tp_call */
    0,                  /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                  /* tp_clear */
    0,                  /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    PyObject_SelfIter,          /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,            /* tp_methods */
    0,                  /* tp_members */
};
14068
14069static PyObject *
14070unicode_iter(PyObject *seq)
14071{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014072 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014073
Benjamin Peterson14339b62009-01-31 16:36:08 +000014074 if (!PyUnicode_Check(seq)) {
14075 PyErr_BadInternalCall();
14076 return NULL;
14077 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014078 if (PyUnicode_READY(seq) == -1)
14079 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014080 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14081 if (it == NULL)
14082 return NULL;
14083 it->it_index = 0;
14084 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014085 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014086 _PyObject_GC_TRACK(it);
14087 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014088}
14089
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014090
/* Return the number of Py_UNICODE elements in `u` before the terminating
   null element.

   The length is derived from a pointer difference and returned as
   size_t; the previous `int` counter could overflow on very long
   strings. */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    const Py_UNICODE *end = u;

    while (*end)
        end++;
    return (size_t)(end - u);
}
14099
/* Copy the null-terminated string `s2`, including its terminator, into
   `s1` and return `s1`.  No bounds checking is performed. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;

    for (;;) {
        Py_UNICODE ch = *s2++;

        *dst++ = ch;
        if (ch == 0)
            break;
    }
    return s1;
}
14107
/* Copy at most `n` elements of the null-terminated string `s2` into `s1`
   and return `s1`.

   Copying stops after the terminating null has been copied or after `n`
   elements, whichever comes first.  As with strncpy(), the result is NOT
   null-terminated when `s2` holds `n` or more characters; unlike
   strncpy(), the remainder of `s1` is not zero-filled.

   The previous loop wrote the element first and tested the count
   afterwards, so it stored up to n + 1 elements (and one element even for
   n == 0), overrunning a destination sized for exactly `n`. */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    size_t i;

    for (i = 0; i < n; i++) {
        Py_UNICODE ch = s2[i];

        s1[i] = ch;
        if (ch == 0)
            break;
    }
    return s1;
}
14117
/* Append the null-terminated string `s2` to the end of `s1` and return
   `s1`.  No bounds checking is performed. */
Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    /* The copy starts at the current terminator of s1. */
    Py_UNICODE_strcpy(s1 + Py_UNICODE_strlen(s1), s2);
    return s1;
}
14126
/* Compare two null-terminated strings element by element.

   Returns -1, 0 or +1 when `s1` is respectively less than, equal to or
   greater than `s2`; a string that is a proper prefix of the other one
   compares as smaller. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    /* Skip the common prefix. */
    for (; *s1 && *s2 && *s1 == *s2; s1++, s2++)
        ;

    if (*s1 == 0)
        return *s2 ? -1 : 0;    /* s1 ended first, or both ended */
    if (*s2 == 0)
        return 1;               /* s2 ended first */
    return (*s1 < *s2) ? -1 : +1;
}
14140
/* Compare at most `n` elements of two null-terminated strings.

   Returns -1 or +1 at the first differing element, and 0 if the strings
   agree up to their common terminator or over the first `n` elements. */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    size_t i;

    for (i = 0; i < n; i++) {
        const Py_UNICODE a = s1[i];
        const Py_UNICODE b = s2[i];

        if (a != b)
            return (a < b) ? -1 : +1;
        if (a == '\0')
            break;      /* both strings ended here */
    }
    return 0;
}
14157
/* Return a pointer to the first occurrence of `c` in the null-terminated
   string `s`, or NULL if it does not occur.  The terminator itself is
   never matched, so searching for 0 yields NULL. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    while (*s) {
        if (*s == c)
            return (Py_UNICODE*)s;
        s++;
    }
    return NULL;
}
14167
/* Return a pointer to the last occurrence of `c` in the null-terminated
   string `s`, or NULL if it does not occur.  The terminator itself is
   never matched. */
Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
{
    const Py_UNICODE *p;

    /* Walk backwards from the terminator towards the start. */
    for (p = s + Py_UNICODE_strlen(s); p != s; ) {
        if (*--p == c)
            return (Py_UNICODE*)p;
    }
    return NULL;
}
Victor Stinner331ea922010-08-10 16:37:20 +000014180
Victor Stinner71133ff2010-09-01 23:43:53 +000014181Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014182PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014183{
Victor Stinner577db2c2011-10-11 22:12:48 +020014184 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014185 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014186
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014187 if (!PyUnicode_Check(unicode)) {
14188 PyErr_BadArgument();
14189 return NULL;
14190 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014191 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014192 if (u == NULL)
14193 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014194 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014195 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014196 PyErr_NoMemory();
14197 return NULL;
14198 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014199 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014200 size *= sizeof(Py_UNICODE);
14201 copy = PyMem_Malloc(size);
14202 if (copy == NULL) {
14203 PyErr_NoMemory();
14204 return NULL;
14205 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014206 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014207 return copy;
14208}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014209
/* A _string module, to export formatter_parser and formatter_field_name_split
   to the string.Formatter class implemented in Python. */

/* Function table of the _string module.  Both entries are METH_O, i.e.
   they take exactly one argument object. */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
14220
/* Module definition passed to PyModule_Create() by PyInit__string(). */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                          /* module name */
    PyDoc_STR("string helper module"),  /* module docstring */
    0,
    _string_methods,                    /* function table */
    /* the remaining fields are not used by this module */
    NULL,
    NULL,
    NULL,
    NULL
};
14232
14233PyMODINIT_FUNC
14234PyInit__string(void)
14235{
14236 return PyModule_Create(&_string_module);
14237}
14238
14239
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014240#ifdef __cplusplus
14241}
14242#endif