blob: f9117f15ee836b799af4845df4877d9348d1e1a0 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
/* DONT_MAKE_RESULT_READY is defined in debug (Py_DEBUG) builds only. */
#ifdef Py_DEBUG
#  define DONT_MAKE_RESULT_READY
#endif

/* Maximum number of objects kept on the Unicode object free list. */
#define PyUnicode_MAXFREELIST 1024

/* Size limit for the free list "stay alive" optimization.

   Objects on the free list whose size is below this limit keep their
   allocated Unicode memory, which reduces malloc() overhead for small
   Unicode objects.

   Worst case, this leaves
       PyUnicode_MAXFREELIST *
           (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + malloc()-overhead)
   bytes of unused garbage around.

   A limit of 0 effectively switches the feature off.

   Note: this is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off. */
#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order selection: little endian unless WORDS_BIGENDIAN is defined. */
#ifdef WORDS_BIGENDIAN
#  define BYTEORDER_IS_BIG_ENDIAN
#else
#  define BYTEORDER_IS_LITTLE_ENDIAN
#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
/* Check used by the assert()s of the macros in this file: the full
   _PyUnicode_CheckConsistency() in debug builds, a plain type check
   otherwise. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Unchecked field accessors.  They expand to the struct member itself,
   so they can be used as assignment targets. */
#define _PyUnicode_UTF8(op)         (((PyCompactUnicodeObject*)(op))->utf8)
#define _PyUnicode_UTF8_LENGTH(op)  (((PyCompactUnicodeObject*)(op))->utf8_length)
#define _PyUnicode_WSTR(op)         (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)  (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)       (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)        (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)         (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_DATA_ANY(op)     (((PyUnicodeObject*)(op))->data.any)

/* UTF-8 buffer of a ready string.  For a compact ASCII object this is the
   memory located directly behind the PyASCIIObject header; otherwise it is
   the utf8 member. */
#define PyUnicode_UTF8(op)                                  \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     PyUnicode_IS_COMPACT_ASCII(op) ?                       \
         ((char*)((PyASCIIObject*)(op) + 1)) :              \
         _PyUnicode_UTF8(op))

/* Length that goes with PyUnicode_UTF8(): the string length for a compact
   ASCII object, the utf8_length member otherwise. */
#define PyUnicode_UTF8_LENGTH(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     PyUnicode_IS_COMPACT_ASCII(op) ?                       \
         ((PyASCIIObject*)(op))->length :                   \
         _PyUnicode_UTF8_LENGTH(op))

/* Checked readers for the kind and length fields. */
#define _PyUnicode_KIND(op)                                 \
    (assert(_PyUnicode_CHECK(op)),                          \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     ((PyASCIIObject *)(op))->length)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200136
/* File-local replacement for PyUnicode_READY(): evaluates to 0 when the
   object is already ready, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready((PyObject *)(op))))

/* Same as PyUnicode_READY(), but takes a pointer to the object pointer and
   delegates to _PyUnicode_ReadyReplace() when the object is not ready.
   The parameter is parenthesised so that any pointer expression can be
   passed, not just a plain identifier. */
#define _PyUnicode_READY_REPLACE(p_obj)                 \
    (assert(_PyUnicode_CHECK(*(p_obj))),                \
     (PyUnicode_IS_READY(*(p_obj)) ?                    \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))

/* true if the utf8 pointer is the character data block itself.
   Must not be used on compact ASCII objects. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* true if the wstr pointer is the character data block itself.
   Uses the macro argument throughout (it used to reference a variable
   named "unicode" from the expansion site). */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
156
/* Non-zero when the object owns a separately allocated UTF-8 buffer:
   it is not compact ASCII, its utf8 pointer is set, and that pointer is
   not the character data block. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                      \
    (assert(_PyUnicode_CHECK(op)),                          \
     (!PyUnicode_IS_COMPACT_ASCII(op)                       \
      && _PyUnicode_UTF8(op)                                \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* Non-zero when the object owns a separately allocated wstr buffer:
   wstr is set and the object is not a ready string whose wstr is the
   character data block. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                      \
    (assert(_PyUnicode_CHECK(op)),                          \
     (_PyUnicode_WSTR(op) &&                                \
      !(PyUnicode_IS_READY(op) &&                           \
        _PyUnicode_WSTR(op) == PyUnicode_DATA(op))))
172
/* Generic helper to copy characters while converting their type.

   from_type and to_type must be valid type names.  begin and end delimit
   the source characters and should be of type "from_type *"; to points to
   the destination buffer and is treated as "to_type *".  Each source
   character is cast to to_type and stored in order. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        const from_type *iter_ = (begin);               \
        to_type *to_ = (to_type *)(to);                 \
        while (iter_ < (end)) {                         \
            *to_ = (to_type)*iter_;                     \
            ++iter_;                                    \
            ++to_;                                      \
        }                                               \
    } while (0)

/* The Unicode string has been modified: invalidate the cached hash. */
#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
190
Walter Dörwald16807132007-05-25 13:52:07 +0000191/* This dictionary holds all interned unicode strings. Note that references
192 to strings in this dictionary are *not* counted in the string's ob_refcnt.
193 When the interned string reaches a refcnt of 0 the string deallocation
194 function will delete the reference from this dictionary.
195
196 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000197 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000198*/
199static PyObject *interned;
200
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000201/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200202static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000203
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200204/* List of static strings. */
205static _Py_Identifier *static_strings;
206
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000207/* Single character Unicode strings in the Latin-1 range are being
208 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200209static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000210
/* Fast detection of the most frequent whitespace characters: one flag per
   code point in the range 0x00-0x7F, 1 meaning "whitespace". */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
241
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200242/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200243static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200244static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200245static void copy_characters(
246 PyObject *to, Py_ssize_t to_start,
247 PyObject *from, Py_ssize_t from_start,
248 Py_ssize_t how_many);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200249#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200250static int unicode_is_singleton(PyObject *unicode);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200251#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200252
Alexander Belopolsky40018472011-02-26 01:02:56 +0000253static PyObject *
254unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000255 PyObject **errorHandler,const char *encoding, const char *reason,
256 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
257 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
258
Alexander Belopolsky40018472011-02-26 01:02:56 +0000259static void
260raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300261 const char *encoding,
262 const Py_UNICODE *unicode, Py_ssize_t size,
263 Py_ssize_t startpos, Py_ssize_t endpos,
264 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000265
/* Same for linebreaks: one flag per code point in the range 0x00-0x7F,
   1 meaning "line break".  The table is only ever read, so it is const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
293
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300294/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
295 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000296Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000297PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000298{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000299#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000300 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000301#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000302 /* This is actually an illegal character, so it should
303 not be passed to unichr. */
304 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000305#endif
306}
307
#ifdef Py_DEBUG
/* Debug-build sanity check of a Unicode object's internal state.

   Asserts that the state flags (kind, compact, ascii, ready) and the
   data/utf8/wstr pointers and lengths agree with each other.  When
   check_content is non-zero, additionally asserts that the narrowest
   suitable kind is in use and that non-singleton strings have no cached
   hash.  Always returns 1, so the call can itself be wrapped in assert().

   FIXME: use PyObject* type for op */
int
_PyUnicode_CheckConsistency(void *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: characters follow the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert(compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready: only the wstr representation exists */
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
            }
            else {
                /* ready, non-compact */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert(compact->utf8 == data);
                    assert(compact->utf8_length == ascii->length);
                }
                else {
                    assert(compact->utf8 != data);
                }
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr is the data block exactly when the kind has the
               width of wchar_t */
#if SIZEOF_WCHAR_T == 2
            const int wchar_sized = (kind == PyUnicode_2BYTE_KIND);
#else
            const int wchar_sized = (kind == PyUnicode_4BYTE_KIND);
#endif
            if (wchar_sized) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        /* a missing buffer must have a zero length */
        assert(compact->utf8 != NULL || compact->utf8_length == 0);
        assert(ascii->wstr != NULL || compact->wstr_length == 0);
    }

    if (check_content) {
        /* check that the best kind is used */
        if (kind != PyUnicode_WCHAR_KIND) {
            Py_ssize_t pos = 0;
            Py_UCS4 maxchar = 0;
            void *chars = PyUnicode_DATA(ascii);

            while (pos < ascii->length) {
                Py_UCS4 ch = PyUnicode_READ(kind, chars, pos);
                if (ch > maxchar)
                    maxchar = ch;
                pos++;
            }
            if (kind == PyUnicode_1BYTE_KIND) {
                if (ascii->state.ascii == 0) {
                    assert(maxchar >= 128);
                }
                else {
                    assert(maxchar < 128);
                }
            }
            else if (kind == PyUnicode_2BYTE_KIND) {
                assert(maxchar >= 0x100);
            }
            else {
                assert(maxchar >= 0x10000);
            }
        }
        if (!unicode_is_singleton((PyObject*)ascii)) {
            assert(ascii->hash == -1);
        }
    }
    return 1;
}
#endif
413
Thomas Wouters477c8d52006-05-27 19:21:47 +0000414/* --- Bloom Filters ----------------------------------------------------- */
415
416/* stuff to implement simple "bloom filters" for Unicode characters.
417 to keep things simple, we use a single bitmask, using the least 5
418 bits from each unicode characters as the bit index. */
419
420/* the linebreak mask is set up by Unicode_Init below */
421
Antoine Pitrouf068f942010-01-13 14:19:12 +0000422#if LONG_BIT >= 128
423#define BLOOM_WIDTH 128
424#elif LONG_BIT >= 64
425#define BLOOM_WIDTH 64
426#elif LONG_BIT >= 32
427#define BLOOM_WIDTH 32
428#else
429#error "LONG_BIT is smaller than 32"
430#endif
431
Thomas Wouters477c8d52006-05-27 19:21:47 +0000432#define BLOOM_MASK unsigned long
433
434static BLOOM_MASK bloom_linebreak;
435
Antoine Pitrouf068f942010-01-13 14:19:12 +0000436#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
437#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000438
Benjamin Peterson29060642009-01-31 22:14:21 +0000439#define BLOOM_LINEBREAK(ch) \
440 ((ch) < 128U ? ascii_linebreak[(ch)] : \
441 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000442
Alexander Belopolsky40018472011-02-26 01:02:56 +0000443Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200444make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000445{
446 /* calculate simple bloom-style bitmask for a given unicode string */
447
Antoine Pitrouf068f942010-01-13 14:19:12 +0000448 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000449 Py_ssize_t i;
450
451 mask = 0;
452 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200453 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000454
455 return mask;
456}
457
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200458#define BLOOM_MEMBER(mask, chr, str) \
459 (BLOOM(mask, chr) \
460 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000461
Guido van Rossumd57fd912000-03-10 22:53:23 +0000462/* --- Unicode Object ----------------------------------------------------- */
463
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200464static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200465fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200466
467Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
468 Py_ssize_t size, Py_UCS4 ch,
469 int direction)
470{
471 /* like wcschr, but doesn't stop at NULL characters */
472 Py_ssize_t i;
473 if (direction == 1) {
474 for(i = 0; i < size; i++)
475 if (PyUnicode_READ(kind, s, i) == ch)
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200476 return (char*)s + kind * i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200477 }
478 else {
479 for(i = size-1; i >= 0; i--)
480 if (PyUnicode_READ(kind, s, i) == ch)
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200481 return (char*)s + kind * i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200482 }
483 return NULL;
484}
485
Victor Stinnerfe226c02011-10-03 03:52:20 +0200486static PyObject*
487resize_compact(PyObject *unicode, Py_ssize_t length)
488{
489 Py_ssize_t char_size;
490 Py_ssize_t struct_size;
491 Py_ssize_t new_size;
492 int share_wstr;
493
494 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200495 char_size = PyUnicode_KIND(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200496 if (PyUnicode_IS_COMPACT_ASCII(unicode))
497 struct_size = sizeof(PyASCIIObject);
498 else
499 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200500 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200501
502 _Py_DEC_REFTOTAL;
503 _Py_ForgetReference(unicode);
504
505 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
506 PyErr_NoMemory();
507 return NULL;
508 }
509 new_size = (struct_size + (length + 1) * char_size);
510
511 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
512 if (unicode == NULL) {
513 PyObject_Del(unicode);
514 PyErr_NoMemory();
515 return NULL;
516 }
517 _Py_NewReference(unicode);
518 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200519 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200520 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200521 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
522 _PyUnicode_WSTR_LENGTH(unicode) = length;
523 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200524 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
525 length, 0);
526 return unicode;
527}
528
Alexander Belopolsky40018472011-02-26 01:02:56 +0000529static int
Victor Stinner95663112011-10-04 01:03:50 +0200530resize_inplace(PyUnicodeObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000531{
Victor Stinner95663112011-10-04 01:03:50 +0200532 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200533 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200534 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000535
Victor Stinner95663112011-10-04 01:03:50 +0200536 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200537
538 if (PyUnicode_IS_READY(unicode)) {
539 Py_ssize_t char_size;
540 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200541 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200542 void *data;
543
544 data = _PyUnicode_DATA_ANY(unicode);
545 assert(data != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200546 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200547 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
548 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200549 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
550 {
551 PyObject_DEL(_PyUnicode_UTF8(unicode));
552 _PyUnicode_UTF8(unicode) = NULL;
553 _PyUnicode_UTF8_LENGTH(unicode) = 0;
554 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200555
556 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
557 PyErr_NoMemory();
558 return -1;
559 }
560 new_size = (length + 1) * char_size;
561
562 data = (PyObject *)PyObject_REALLOC(data, new_size);
563 if (data == NULL) {
564 PyErr_NoMemory();
565 return -1;
566 }
567 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200568 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200569 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200570 _PyUnicode_WSTR_LENGTH(unicode) = length;
571 }
572 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200573 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200574 _PyUnicode_UTF8_LENGTH(unicode) = length;
575 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200576 _PyUnicode_LENGTH(unicode) = length;
577 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200578 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200579 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200580 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200581 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200582 }
Victor Stinner95663112011-10-04 01:03:50 +0200583 assert(_PyUnicode_WSTR(unicode) != NULL);
584
585 /* check for integer overflow */
586 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
587 PyErr_NoMemory();
588 return -1;
589 }
590 wstr = _PyUnicode_WSTR(unicode);
591 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
592 if (!wstr) {
593 PyErr_NoMemory();
594 return -1;
595 }
596 _PyUnicode_WSTR(unicode) = wstr;
597 _PyUnicode_WSTR(unicode)[length] = 0;
598 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200599 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000600 return 0;
601}
602
Victor Stinnerfe226c02011-10-03 03:52:20 +0200603static PyObject*
604resize_copy(PyObject *unicode, Py_ssize_t length)
605{
606 Py_ssize_t copy_length;
607 if (PyUnicode_IS_COMPACT(unicode)) {
608 PyObject *copy;
609 assert(PyUnicode_IS_READY(unicode));
610
611 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
612 if (copy == NULL)
613 return NULL;
614
615 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200616 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200617 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200618 }
619 else {
Victor Stinner2fd82272011-10-03 04:06:05 +0200620 PyUnicodeObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200621 assert(_PyUnicode_WSTR(unicode) != NULL);
622 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner2fd82272011-10-03 04:06:05 +0200623 w = _PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200624 if (w == NULL)
625 return NULL;
626 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
627 copy_length = Py_MIN(copy_length, length);
628 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
629 copy_length);
630 return (PyObject*)w;
631 }
632}
633
Guido van Rossumd57fd912000-03-10 22:53:23 +0000634/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000635 Ux0000 terminated; some code (e.g. new_identifier)
636 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000637
638 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000639 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000640
641*/
642
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200643#ifdef Py_DEBUG
644int unicode_old_new_calls = 0;
645#endif
646
Alexander Belopolsky40018472011-02-26 01:02:56 +0000647static PyUnicodeObject *
648_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000649{
650 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200651 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000652
Thomas Wouters477c8d52006-05-27 19:21:47 +0000653 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000654 if (length == 0 && unicode_empty != NULL) {
655 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200656 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000657 }
658
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000659 /* Ensure we won't overflow the size. */
660 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
661 return (PyUnicodeObject *)PyErr_NoMemory();
662 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200663 if (length < 0) {
664 PyErr_SetString(PyExc_SystemError,
665 "Negative size passed to _PyUnicode_New");
666 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000667 }
668
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200669#ifdef Py_DEBUG
670 ++unicode_old_new_calls;
671#endif
672
673 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
674 if (unicode == NULL)
675 return NULL;
676 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
677 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
678 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000679 PyErr_NoMemory();
680 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000681 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200682
Jeremy Hyltond8082792003-09-16 19:41:39 +0000683 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000684 * the caller fails before initializing str -- unicode_resize()
685 * reads str[0], and the Keep-Alive optimization can keep memory
686 * allocated for str alive across a call to unicode_dealloc(unicode).
687 * We don't want unicode_resize to read uninitialized memory in
688 * that case.
689 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200690 _PyUnicode_WSTR(unicode)[0] = 0;
691 _PyUnicode_WSTR(unicode)[length] = 0;
692 _PyUnicode_WSTR_LENGTH(unicode) = length;
693 _PyUnicode_HASH(unicode) = -1;
694 _PyUnicode_STATE(unicode).interned = 0;
695 _PyUnicode_STATE(unicode).kind = 0;
696 _PyUnicode_STATE(unicode).compact = 0;
697 _PyUnicode_STATE(unicode).ready = 0;
698 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200699 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200700 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200701 _PyUnicode_UTF8(unicode) = NULL;
702 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000703 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000704
Benjamin Peterson29060642009-01-31 22:14:21 +0000705 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000706 /* XXX UNREF/NEWREF interface should be more symmetrical */
707 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000708 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000709 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000710 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000711}
712
Victor Stinnerf42dc442011-10-02 23:33:16 +0200713static const char*
714unicode_kind_name(PyObject *unicode)
715{
Victor Stinner42dfd712011-10-03 14:41:45 +0200716 /* don't check consistency: unicode_kind_name() is called from
717 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200718 if (!PyUnicode_IS_COMPACT(unicode))
719 {
720 if (!PyUnicode_IS_READY(unicode))
721 return "wstr";
722 switch(PyUnicode_KIND(unicode))
723 {
724 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200725 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200726 return "legacy ascii";
727 else
728 return "legacy latin1";
729 case PyUnicode_2BYTE_KIND:
730 return "legacy UCS2";
731 case PyUnicode_4BYTE_KIND:
732 return "legacy UCS4";
733 default:
734 return "<legacy invalid kind>";
735 }
736 }
737 assert(PyUnicode_IS_READY(unicode));
738 switch(PyUnicode_KIND(unicode))
739 {
740 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200741 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200742 return "ascii";
743 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200744 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200745 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200746 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200747 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200748 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200749 default:
750 return "<invalid compact kind>";
751 }
752}
753
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200754#ifdef Py_DEBUG
/* Debug-build counter: PyUnicode_New() increments it on every call that
   does not take the empty-string shortcut. */
int unicode_new_new_calls = 0;
756
/* Functions wrapping macros for use in debugger */

/* Callable form of the PyUnicode_UTF8() macro. */
char *_PyUnicode_utf8(void *unicode){
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
761
/* Callable form of the _PyUnicode_COMPACT_DATA() macro. */
void *_PyUnicode_compact_data(void *unicode) {
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
765void *_PyUnicode_data(void *unicode){
766 printf("obj %p\n", unicode);
767 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
768 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
769 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
770 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
771 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
772 return PyUnicode_DATA(unicode);
773}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200774
775void
776_PyUnicode_Dump(PyObject *op)
777{
778 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200779 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
780 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
781 void *data;
782 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
783 if (ascii->state.compact)
784 data = (compact + 1);
785 else
786 data = unicode->data.any;
787 if (ascii->wstr == data)
788 printf("shared ");
789 printf("wstr=%p", ascii->wstr);
Victor Stinnera3b334d2011-10-03 13:53:37 +0200790 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200791 printf(" (%zu), ", compact->wstr_length);
792 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
793 printf("shared ");
794 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200795 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200796 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200797}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200798#endif
799
800PyObject *
801PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
802{
803 PyObject *obj;
804 PyCompactUnicodeObject *unicode;
805 void *data;
806 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200807 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200808 Py_ssize_t char_size;
809 Py_ssize_t struct_size;
810
811 /* Optimization for empty strings */
812 if (size == 0 && unicode_empty != NULL) {
813 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200814 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200815 }
816
817#ifdef Py_DEBUG
818 ++unicode_new_new_calls;
819#endif
820
Victor Stinner9e9d6892011-10-04 01:02:02 +0200821 is_ascii = 0;
822 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200823 struct_size = sizeof(PyCompactUnicodeObject);
824 if (maxchar < 128) {
825 kind_state = PyUnicode_1BYTE_KIND;
826 char_size = 1;
827 is_ascii = 1;
828 struct_size = sizeof(PyASCIIObject);
829 }
830 else if (maxchar < 256) {
831 kind_state = PyUnicode_1BYTE_KIND;
832 char_size = 1;
833 }
834 else if (maxchar < 65536) {
835 kind_state = PyUnicode_2BYTE_KIND;
836 char_size = 2;
837 if (sizeof(wchar_t) == 2)
838 is_sharing = 1;
839 }
840 else {
841 kind_state = PyUnicode_4BYTE_KIND;
842 char_size = 4;
843 if (sizeof(wchar_t) == 4)
844 is_sharing = 1;
845 }
846
847 /* Ensure we won't overflow the size. */
848 if (size < 0) {
849 PyErr_SetString(PyExc_SystemError,
850 "Negative size passed to PyUnicode_New");
851 return NULL;
852 }
853 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
854 return PyErr_NoMemory();
855
856 /* Duplicated allocation code from _PyObject_New() instead of a call to
857 * PyObject_New() so we are able to allocate space for the object and
858 * it's data buffer.
859 */
860 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
861 if (obj == NULL)
862 return PyErr_NoMemory();
863 obj = PyObject_INIT(obj, &PyUnicode_Type);
864 if (obj == NULL)
865 return NULL;
866
867 unicode = (PyCompactUnicodeObject *)obj;
868 if (is_ascii)
869 data = ((PyASCIIObject*)obj) + 1;
870 else
871 data = unicode + 1;
872 _PyUnicode_LENGTH(unicode) = size;
873 _PyUnicode_HASH(unicode) = -1;
874 _PyUnicode_STATE(unicode).interned = 0;
875 _PyUnicode_STATE(unicode).kind = kind_state;
876 _PyUnicode_STATE(unicode).compact = 1;
877 _PyUnicode_STATE(unicode).ready = 1;
878 _PyUnicode_STATE(unicode).ascii = is_ascii;
879 if (is_ascii) {
880 ((char*)data)[size] = 0;
881 _PyUnicode_WSTR(unicode) = NULL;
882 }
883 else if (kind_state == PyUnicode_1BYTE_KIND) {
884 ((char*)data)[size] = 0;
885 _PyUnicode_WSTR(unicode) = NULL;
886 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200887 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200888 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200889 }
890 else {
891 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200892 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200893 if (kind_state == PyUnicode_2BYTE_KIND)
894 ((Py_UCS2*)data)[size] = 0;
895 else /* kind_state == PyUnicode_4BYTE_KIND */
896 ((Py_UCS4*)data)[size] = 0;
897 if (is_sharing) {
898 _PyUnicode_WSTR_LENGTH(unicode) = size;
899 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
900 }
901 else {
902 _PyUnicode_WSTR_LENGTH(unicode) = 0;
903 _PyUnicode_WSTR(unicode) = NULL;
904 }
905 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200906 assert(_PyUnicode_CheckConsistency(unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200907 return obj;
908}
909
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4, writing into the
   4-byte data buffer of `unicode`.  A high surrogate immediately followed
   by a low surrogate is combined into one code point; every other unit
   (lone surrogates included) is copied unchanged.  The remaining
   conversions are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (src[0] >= 0xD800 && src[0] <= 0xDBFF
            && (src+1) < end && src[1] >= 0xDC00 && src[1] <= 0xDFFF)
        {
            /* surrogate pair -> one code point above U+FFFF */
            *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
            src += 2;
        }
        else {
            *dst++ = *src++;
        }
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
948
Victor Stinnercd9950f2011-10-02 00:34:53 +0200949static int
950_PyUnicode_Dirty(PyObject *unicode)
951{
Victor Stinner910337b2011-10-03 03:20:16 +0200952 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +0200953 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +0200954 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +0200955 "Cannot modify a string having more than 1 reference");
956 return -1;
957 }
958 _PyUnicode_DIRTY(unicode);
959 return 0;
960}
961
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200962static int
963_copy_characters(PyObject *to, Py_ssize_t to_start,
964 PyObject *from, Py_ssize_t from_start,
965 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200966{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200967 unsigned int from_kind, to_kind;
968 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200969 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200970
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200971 assert(PyUnicode_Check(from));
972 assert(PyUnicode_Check(to));
973 assert(PyUnicode_IS_READY(from));
974 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200975
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200976 assert(PyUnicode_GET_LENGTH(from) >= how_many);
977 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
978 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200979
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200980 if (how_many == 0)
981 return 0;
982
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200983 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200984 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200985 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200986 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200987
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200988#ifdef Py_DEBUG
989 if (!check_maxchar
990 && (from_kind > to_kind
991 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200992 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200993 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
994 Py_UCS4 ch;
995 Py_ssize_t i;
996 for (i=0; i < how_many; i++) {
997 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
998 assert(ch <= to_maxchar);
999 }
1000 }
1001#endif
1002 fast = (from_kind == to_kind);
1003 if (check_maxchar
1004 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1005 {
1006 /* deny latin1 => ascii */
1007 fast = 0;
1008 }
1009
1010 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001011 Py_MEMCPY((char*)to_data + to_kind * to_start,
1012 (char*)from_data + from_kind * from_start,
1013 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001014 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001015 else if (from_kind == PyUnicode_1BYTE_KIND
1016 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001017 {
1018 _PyUnicode_CONVERT_BYTES(
1019 Py_UCS1, Py_UCS2,
1020 PyUnicode_1BYTE_DATA(from) + from_start,
1021 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1022 PyUnicode_2BYTE_DATA(to) + to_start
1023 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001024 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001025 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001026 && to_kind == PyUnicode_4BYTE_KIND)
1027 {
1028 _PyUnicode_CONVERT_BYTES(
1029 Py_UCS1, Py_UCS4,
1030 PyUnicode_1BYTE_DATA(from) + from_start,
1031 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1032 PyUnicode_4BYTE_DATA(to) + to_start
1033 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001034 }
1035 else if (from_kind == PyUnicode_2BYTE_KIND
1036 && to_kind == PyUnicode_4BYTE_KIND)
1037 {
1038 _PyUnicode_CONVERT_BYTES(
1039 Py_UCS2, Py_UCS4,
1040 PyUnicode_2BYTE_DATA(from) + from_start,
1041 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1042 PyUnicode_4BYTE_DATA(to) + to_start
1043 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001044 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001045 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001046 /* check if max_char(from substring) <= max_char(to) */
1047 if (from_kind > to_kind
1048 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001049 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001050 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001051 /* slow path to check for character overflow */
1052 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001053 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001054 Py_ssize_t i;
1055
Victor Stinner56c161a2011-10-06 02:47:11 +02001056#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001057 for (i=0; i < how_many; i++) {
1058 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001059 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001060 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1061 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001062#else
1063 if (!check_maxchar) {
1064 for (i=0; i < how_many; i++) {
1065 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1066 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1067 }
1068 }
1069 else {
1070 for (i=0; i < how_many; i++) {
1071 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1072 if (ch > to_maxchar)
1073 return 1;
1074 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1075 }
1076 }
1077#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001078 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001079 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001080 assert(0 && "inconsistent state");
1081 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001082 }
1083 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001084 return 0;
1085}
1086
1087static void
1088copy_characters(PyObject *to, Py_ssize_t to_start,
1089 PyObject *from, Py_ssize_t from_start,
1090 Py_ssize_t how_many)
1091{
1092 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1093}
1094
1095Py_ssize_t
1096PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1097 PyObject *from, Py_ssize_t from_start,
1098 Py_ssize_t how_many)
1099{
1100 int err;
1101
1102 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1103 PyErr_BadInternalCall();
1104 return -1;
1105 }
1106
1107 if (PyUnicode_READY(from))
1108 return -1;
1109 if (PyUnicode_READY(to))
1110 return -1;
1111
1112 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1113 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1114 PyErr_Format(PyExc_SystemError,
1115 "Cannot write %zi characters at %zi "
1116 "in a string of %zi characters",
1117 how_many, to_start, PyUnicode_GET_LENGTH(to));
1118 return -1;
1119 }
1120
1121 if (how_many == 0)
1122 return 0;
1123
1124 if (_PyUnicode_Dirty(to))
1125 return -1;
1126
1127 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1128 if (err) {
1129 PyErr_Format(PyExc_SystemError,
1130 "Cannot copy %s characters "
1131 "into a string of %s characters",
1132 unicode_kind_name(from),
1133 unicode_kind_name(to));
1134 return -1;
1135 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001136 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001137}
1138
Victor Stinner17222162011-09-28 22:15:37 +02001139/* Find the maximum code point and count the number of surrogate pairs so a
1140 correct string length can be computed before converting a string to UCS4.
1141 This function counts single surrogates as a character and not as a pair.
1142
1143 Return 0 on success, or -1 on error. */
1144static int
1145find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1146 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001147{
1148 const wchar_t *iter;
1149
Victor Stinnerc53be962011-10-02 21:33:54 +02001150 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001151 *num_surrogates = 0;
1152 *maxchar = 0;
1153
1154 for (iter = begin; iter < end; ) {
Victor Stinnerae864852011-10-05 14:02:44 +02001155 if (*iter > *maxchar) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001156 *maxchar = *iter;
Victor Stinnerae864852011-10-05 14:02:44 +02001157#if SIZEOF_WCHAR_T != 2
1158 if (*maxchar >= 0x10000)
1159 return 0;
1160#endif
1161 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001162#if SIZEOF_WCHAR_T == 2
1163 if (*iter >= 0xD800 && *iter <= 0xDBFF
1164 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1165 {
1166 Py_UCS4 surrogate_val;
1167 surrogate_val = (((iter[0] & 0x3FF)<<10)
1168 | (iter[1] & 0x3FF)) + 0x10000;
1169 ++(*num_surrogates);
1170 if (surrogate_val > *maxchar)
1171 *maxchar = surrogate_val;
1172 iter += 2;
1173 }
1174 else
1175 iter++;
1176#else
1177 iter++;
1178#endif
1179 }
1180 return 0;
1181}
1182
#ifdef Py_DEBUG
/* Debug-build counter incremented on every call to unicode_ready(). */
int unicode_ready_calls = 0;
#endif
1186
/* Turn the legacy, not-yet-ready string *p_obj (only its wstr buffer is
   set) into a ready string by building its canonical 1-, 2- or 4-byte
   representation.

   The wstr buffer is scanned for its largest code point, then:
     - maxchar < 256:   a new 1-byte buffer is allocated and filled and the
                        wstr buffer is freed; for maxchar < 128 the utf8
                        pointer is set to the same buffer;
     - maxchar < 65536: with a 16-bit wchar_t the wstr buffer itself
                        becomes the data, otherwise a 2-byte buffer is
                        allocated and the wstr buffer is freed;
     - otherwise:       with a 16-bit wchar_t a 4-byte buffer is allocated
                        and surrogate pairs are combined, otherwise the
                        wstr buffer itself becomes the data.

   If `replace` is true and the object has a single reference, an empty
   string is replaced by unicode_empty and a one-character string below
   256 by the object returned from get_latin1_char(); in both cases
   *p_obj is updated and the original object is released.  In debug builds
   `replace` with a reference count other than 1 is an assertion failure;
   in release builds `replace` is then silently ignored.

   Returns 0 on success and -1 on failure (an allocation failed and
   MemoryError was set, or get_latin1_char() returned NULL). */
static int
unicode_ready(PyObject **p_obj, int replace)
{
    PyUnicodeObject *unicode;
    wchar_t *end;
    Py_UCS4 maxchar = 0;
    Py_ssize_t num_surrogates;
#if SIZEOF_WCHAR_T == 2
    Py_ssize_t length_wo_surrogates;
#endif

    assert(p_obj != NULL);
    unicode = (PyUnicodeObject *)*p_obj;

    /* _PyUnicode_Ready() is only intended for old-style API usage where
       strings were created using _PyObject_New() and where no canonical
       representation (the str field) has been set yet aka strings
       which are not yet ready. */
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
    assert(_PyUnicode_WSTR(unicode) != NULL);
    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
    assert(_PyUnicode_UTF8(unicode) == NULL);
    /* Actually, it should neither be interned nor be anything else: */
    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);

#ifdef Py_DEBUG
    ++unicode_ready_calls;
#endif

    /* Replacing the object is only valid when nobody else can observe it. */
#ifdef Py_DEBUG
    assert(!replace || Py_REFCNT(unicode) == 1);
#else
    if (replace && Py_REFCNT(unicode) != 1)
        replace = 0;
#endif
    if (replace) {
        Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
        wchar_t *wstr = _PyUnicode_WSTR(unicode);
        /* Optimization for empty strings */
        if (len == 0) {
            Py_INCREF(unicode_empty);
            Py_DECREF(*p_obj);
            *p_obj = unicode_empty;
            return 0;
        }
        /* One character below 256: hand out the object returned by
           get_latin1_char() instead of converting this one. */
        if (len == 1 && wstr[0] < 256) {
            PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
            if (latin1_char == NULL)
                return -1;
            Py_DECREF(*p_obj);
            *p_obj = latin1_char;
            return 0;
        }
    }

    /* Find the widest character to pick the target representation. */
    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
                                &maxchar, &num_surrogates) == -1)
        return -1;

    if (maxchar < 256) {
        /* 1 byte per character plus the terminating null byte */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
                                 _PyUnicode_WSTR(unicode), end,
                                 PyUnicode_1BYTE_DATA(unicode));
        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
        if (maxchar < 128) {
            /* Pure ASCII: the utf8 pointer aliases the data buffer. */
            _PyUnicode_STATE(unicode).ascii = 1;
            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        }
        else {
            _PyUnicode_STATE(unicode).ascii = 0;
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }
        /* The wide buffer is no longer needed. */
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
    /* In this case we might have to convert down from 4-byte native
       wchar_t to 2-byte unicode. */
    else if (maxchar < 65536) {
        assert(num_surrogates == 0 &&
               "FindMaxCharAndNumSurrogatePairs() messed up");

#if SIZEOF_WCHAR_T == 2
        /* We can share representations and are done. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
#else
        /* sizeof(wchar_t) == 4 */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
                                 _PyUnicode_WSTR(unicode), end,
                                 PyUnicode_2BYTE_DATA(unicode));
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#endif
    }
    /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
    else {
#if SIZEOF_WCHAR_T == 2
        /* in case the native representation is 2-bytes, we need to allocate a
           new normalized 4-byte version. */
        /* Each surrogate pair becomes a single code point in the result. */
        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        /* unicode_convert_wchar_to_ucs4() requires a ready string */
        _PyUnicode_STATE(unicode).ready = 1;
        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#else
        assert(num_surrogates == 0);

        /* wchar_t is already 4 bytes wide: share the buffer with wstr. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
#endif
        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
    }
    _PyUnicode_STATE(unicode).ready = 1;
    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return 0;
}
1345
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001346int
1347_PyUnicode_ReadyReplace(PyObject **op)
1348{
1349 return unicode_ready(op, 1);
1350}
1351
1352int
1353_PyUnicode_Ready(PyObject *op)
1354{
1355 return unicode_ready(&op, 0);
1356}
1357
Alexander Belopolsky40018472011-02-26 01:02:56 +00001358static void
1359unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001360{
Walter Dörwald16807132007-05-25 13:52:07 +00001361 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001362 case SSTATE_NOT_INTERNED:
1363 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001364
Benjamin Peterson29060642009-01-31 22:14:21 +00001365 case SSTATE_INTERNED_MORTAL:
1366 /* revive dead object temporarily for DelItem */
1367 Py_REFCNT(unicode) = 3;
1368 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1369 Py_FatalError(
1370 "deletion of interned string failed");
1371 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001372
Benjamin Peterson29060642009-01-31 22:14:21 +00001373 case SSTATE_INTERNED_IMMORTAL:
1374 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001375
Benjamin Peterson29060642009-01-31 22:14:21 +00001376 default:
1377 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001378 }
1379
Victor Stinner03490912011-10-03 23:45:12 +02001380 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001381 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001382 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001383 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001384
1385 if (PyUnicode_IS_COMPACT(unicode)) {
1386 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001387 }
1388 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001389 if (_PyUnicode_DATA_ANY(unicode))
1390 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001391 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001392 }
1393}
1394
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is the shared empty string or the
   entry of the unicode_latin1[] table for its single character, and 0
   otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;
    /* Only one-character strings with a real kind can be in the table. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch < 256 && unicode_latin1[ch] == unicode)
        return 1;
    return 0;
}
#endif
1411
Alexander Belopolsky40018472011-02-26 01:02:56 +00001412static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001413unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001414{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001415 if (Py_REFCNT(unicode) != 1)
1416 return 0;
1417 if (PyUnicode_CHECK_INTERNED(unicode))
1418 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001419#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001420 /* singleton refcount is greater than 1 */
1421 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001422#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001423 return 1;
1424}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001425
Victor Stinnerfe226c02011-10-03 03:52:20 +02001426static int
1427unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1428{
1429 PyObject *unicode;
1430 Py_ssize_t old_length;
1431
1432 assert(p_unicode != NULL);
1433 unicode = *p_unicode;
1434
1435 assert(unicode != NULL);
1436 assert(PyUnicode_Check(unicode));
1437 assert(0 <= length);
1438
Victor Stinner910337b2011-10-03 03:20:16 +02001439 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001440 old_length = PyUnicode_WSTR_LENGTH(unicode);
1441 else
1442 old_length = PyUnicode_GET_LENGTH(unicode);
1443 if (old_length == length)
1444 return 0;
1445
Victor Stinnerfe226c02011-10-03 03:52:20 +02001446 if (!unicode_resizable(unicode)) {
1447 PyObject *copy = resize_copy(unicode, length);
1448 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001449 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001450 Py_DECREF(*p_unicode);
1451 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001452 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001453 }
1454
Victor Stinnerfe226c02011-10-03 03:52:20 +02001455 if (PyUnicode_IS_COMPACT(unicode)) {
1456 *p_unicode = resize_compact(unicode, length);
1457 if (*p_unicode == NULL)
1458 return -1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001459 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001460 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001461 }
1462 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001463}
1464
Alexander Belopolsky40018472011-02-26 01:02:56 +00001465int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001466PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001467{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001468 PyObject *unicode;
1469 if (p_unicode == NULL) {
1470 PyErr_BadInternalCall();
1471 return -1;
1472 }
1473 unicode = *p_unicode;
1474 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1475 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1476 {
1477 PyErr_BadInternalCall();
1478 return -1;
1479 }
1480 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001481}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001482
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001483static PyObject*
1484get_latin1_char(unsigned char ch)
1485{
Victor Stinnera464fc12011-10-02 20:39:30 +02001486 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001487 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001488 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001489 if (!unicode)
1490 return NULL;
1491 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001492 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001493 unicode_latin1[ch] = unicode;
1494 }
1495 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001496 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001497}
1498
Alexander Belopolsky40018472011-02-26 01:02:56 +00001499PyObject *
1500PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001501{
1502 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001503 Py_UCS4 maxchar = 0;
1504 Py_ssize_t num_surrogates;
1505
1506 if (u == NULL)
1507 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001508
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001509 /* If the Unicode data is known at construction time, we can apply
1510 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001511
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001512 /* Optimization for empty strings */
1513 if (size == 0 && unicode_empty != NULL) {
1514 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001515 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001516 }
Tim Petersced69f82003-09-16 20:30:58 +00001517
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001518 /* Single character Unicode objects in the Latin-1 range are
1519 shared when using this constructor */
1520 if (size == 1 && *u < 256)
1521 return get_latin1_char((unsigned char)*u);
1522
1523 /* If not empty and not single character, copy the Unicode data
1524 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001525 if (find_maxchar_surrogates(u, u + size,
1526 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001527 return NULL;
1528
1529 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1530 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001531 if (!unicode)
1532 return NULL;
1533
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001534 switch (PyUnicode_KIND(unicode)) {
1535 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001536 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001537 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1538 break;
1539 case PyUnicode_2BYTE_KIND:
1540#if Py_UNICODE_SIZE == 2
1541 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1542#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001543 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001544 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1545#endif
1546 break;
1547 case PyUnicode_4BYTE_KIND:
1548#if SIZEOF_WCHAR_T == 2
1549 /* This is the only case which has to process surrogates, thus
1550 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001551 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001552#else
1553 assert(num_surrogates == 0);
1554 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1555#endif
1556 break;
1557 default:
1558 assert(0 && "Impossible state");
1559 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001560
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001561 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001562 return (PyObject *)unicode;
1563}
1564
Alexander Belopolsky40018472011-02-26 01:02:56 +00001565PyObject *
1566PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001567{
1568 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001569
Benjamin Peterson14339b62009-01-31 16:36:08 +00001570 if (size < 0) {
1571 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001572 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001573 return NULL;
1574 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001575
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001576 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001577 some optimizations which share commonly used objects.
1578 Also, this means the input must be UTF-8, so fall back to the
1579 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001580 if (u != NULL) {
1581
Benjamin Peterson29060642009-01-31 22:14:21 +00001582 /* Optimization for empty strings */
1583 if (size == 0 && unicode_empty != NULL) {
1584 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001585 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001586 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001587
1588 /* Single characters are shared when using this constructor.
1589 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001590 if (size == 1 && Py_CHARMASK(*u) < 128)
1591 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001592
1593 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001594 }
1595
Walter Dörwald55507312007-05-18 13:12:10 +00001596 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001597 if (!unicode)
1598 return NULL;
1599
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001600 return (PyObject *)unicode;
1601}
1602
Alexander Belopolsky40018472011-02-26 01:02:56 +00001603PyObject *
1604PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001605{
1606 size_t size = strlen(u);
1607 if (size > PY_SSIZE_T_MAX) {
1608 PyErr_SetString(PyExc_OverflowError, "input too long");
1609 return NULL;
1610 }
1611
1612 return PyUnicode_FromStringAndSize(u, size);
1613}
1614
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001615PyObject *
1616_PyUnicode_FromId(_Py_Identifier *id)
1617{
1618 if (!id->object) {
1619 id->object = PyUnicode_FromString(id->string);
1620 if (!id->object)
1621 return NULL;
1622 PyUnicode_InternInPlace(&id->object);
1623 assert(!id->next);
1624 id->next = static_strings;
1625 static_strings = id;
1626 }
1627 Py_INCREF(id->object);
1628 return id->object;
1629}
1630
1631void
1632_PyUnicode_ClearStaticStrings()
1633{
1634 _Py_Identifier *i;
1635 for (i = static_strings; i; i = i->next) {
1636 Py_DECREF(i->object);
1637 i->object = NULL;
1638 i->next = NULL;
1639 }
1640}
1641
Victor Stinnere57b1c02011-09-28 22:20:48 +02001642static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001643unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001644{
Victor Stinner0617b6e2011-10-05 23:26:01 +02001645 PyObject *res;
1646#ifdef Py_DEBUG
1647 const unsigned char *p;
1648 const unsigned char *end = s + size;
1649 for (p=s; p < end; p++) {
1650 assert(*p < 128);
1651 }
1652#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001653 if (size == 1)
1654 return get_latin1_char(s[0]);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001655 res = PyUnicode_New(size, 127);
Victor Stinner702c7342011-10-05 13:50:52 +02001656 if (!res)
1657 return NULL;
Victor Stinner0617b6e2011-10-05 23:26:01 +02001658 memcpy(PyUnicode_1BYTE_DATA(res), s, size);
Victor Stinner702c7342011-10-05 13:50:52 +02001659 return res;
1660}
1661
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001662static Py_UCS4
1663kind_maxchar_limit(unsigned int kind)
1664{
1665 switch(kind) {
1666 case PyUnicode_1BYTE_KIND:
1667 return 0x80;
1668 case PyUnicode_2BYTE_KIND:
1669 return 0x100;
1670 case PyUnicode_4BYTE_KIND:
1671 return 0x10000;
1672 default:
1673 assert(0 && "invalid kind");
1674 return 0x10ffff;
1675 }
1676}
1677
Victor Stinner702c7342011-10-05 13:50:52 +02001678static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001679_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001680{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001681 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001682 unsigned char max_char = 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001683 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001684
1685 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001686 if (size == 1)
1687 return get_latin1_char(u[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001688 for (i = 0; i < size; i++) {
1689 if (u[i] & 0x80) {
Victor Stinnerb9275c12011-10-05 14:01:42 +02001690 max_char = 255;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001691 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001692 }
1693 }
Victor Stinnerb9275c12011-10-05 14:01:42 +02001694 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001695 if (!res)
1696 return NULL;
1697 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001698 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001699 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001700}
1701
Victor Stinnere57b1c02011-09-28 22:20:48 +02001702static PyObject*
1703_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001704{
1705 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001706 Py_UCS2 max_char = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001707 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001708
1709 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001710 if (size == 1 && u[0] < 256)
1711 return get_latin1_char(u[0]);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001712 for (i = 0; i < size; i++) {
1713 if (u[i] > max_char) {
1714 max_char = u[i];
1715 if (max_char >= 256)
1716 break;
1717 }
1718 }
1719 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001720 if (!res)
1721 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001722 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001723 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1724 else
1725 for (i = 0; i < size; i++)
1726 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001727 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001728 return res;
1729}
1730
Victor Stinnere57b1c02011-09-28 22:20:48 +02001731static PyObject*
1732_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001733{
1734 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001735 Py_UCS4 max_char = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001736 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001737
1738 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001739 if (size == 1 && u[0] < 256)
1740 return get_latin1_char(u[0]);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001741 for (i = 0; i < size; i++) {
1742 if (u[i] > max_char) {
1743 max_char = u[i];
1744 if (max_char >= 0x10000)
1745 break;
1746 }
1747 }
1748 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001749 if (!res)
1750 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001751 if (max_char >= 0x10000)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001752 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1753 else {
1754 int kind = PyUnicode_KIND(res);
1755 void *data = PyUnicode_DATA(res);
1756 for (i = 0; i < size; i++)
1757 PyUnicode_WRITE(kind, data, i, u[i]);
1758 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001759 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001760 return res;
1761}
1762
1763PyObject*
1764PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1765{
1766 switch(kind) {
1767 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001768 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001769 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001770 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001771 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001772 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001773 default:
1774 assert(0 && "invalid kind");
1775 PyErr_SetString(PyExc_SystemError, "invalid kind");
1776 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001777 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001778}
1779
Victor Stinner25a4b292011-10-06 12:31:55 +02001780/* Ensure that a string uses the most efficient storage, if it is not the
1781 case: create a new string with of the right kind. Write NULL into *p_unicode
1782 on error. */
1783void
1784unicode_adjust_maxchar(PyObject **p_unicode)
1785{
1786 PyObject *unicode, *copy;
1787 Py_UCS4 max_char;
1788 Py_ssize_t i, len;
1789 unsigned int kind;
1790
1791 assert(p_unicode != NULL);
1792 unicode = *p_unicode;
1793 assert(PyUnicode_IS_READY(unicode));
1794 if (PyUnicode_IS_ASCII(unicode))
1795 return;
1796
1797 len = PyUnicode_GET_LENGTH(unicode);
1798 kind = PyUnicode_KIND(unicode);
1799 if (kind == PyUnicode_1BYTE_KIND) {
1800 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
1801 for (i = 0; i < len; i++) {
1802 if (u[i] & 0x80)
1803 return;
1804 }
1805 max_char = 127;
1806 }
1807 else if (kind == PyUnicode_2BYTE_KIND) {
1808 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
1809 max_char = 0;
1810 for (i = 0; i < len; i++) {
1811 if (u[i] > max_char) {
1812 max_char = u[i];
1813 if (max_char >= 256)
1814 return;
1815 }
1816 }
1817 }
1818 else {
Antoine Pitrou15a66cf2011-10-06 15:25:32 +02001819 const Py_UCS4 *u;
Victor Stinner25a4b292011-10-06 12:31:55 +02001820 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitrou15a66cf2011-10-06 15:25:32 +02001821 u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001822 max_char = 0;
1823 for (i = 0; i < len; i++) {
1824 if (u[i] > max_char) {
1825 max_char = u[i];
1826 if (max_char >= 0x10000)
1827 return;
1828 }
1829 }
1830 }
Victor Stinner200f2132011-10-06 13:27:56 +02001831 assert(max_char < PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinner25a4b292011-10-06 12:31:55 +02001832 copy = PyUnicode_New(len, max_char);
1833 copy_characters(copy, 0, unicode, 0, len);
1834 Py_DECREF(unicode);
1835 *p_unicode = copy;
1836}
1837
Victor Stinner034f6cf2011-09-30 02:26:44 +02001838PyObject*
1839PyUnicode_Copy(PyObject *unicode)
1840{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001841 Py_ssize_t size;
1842 PyObject *copy;
1843 void *data;
1844
Victor Stinner034f6cf2011-09-30 02:26:44 +02001845 if (!PyUnicode_Check(unicode)) {
1846 PyErr_BadInternalCall();
1847 return NULL;
1848 }
1849 if (PyUnicode_READY(unicode))
1850 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001851
1852 size = PyUnicode_GET_LENGTH(unicode);
1853 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1854 if (!copy)
1855 return NULL;
1856 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1857
1858 data = PyUnicode_DATA(unicode);
1859 switch (PyUnicode_KIND(unicode))
1860 {
1861 case PyUnicode_1BYTE_KIND:
1862 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1863 break;
1864 case PyUnicode_2BYTE_KIND:
1865 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1866 break;
1867 case PyUnicode_4BYTE_KIND:
1868 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1869 break;
1870 default:
1871 assert(0);
1872 break;
1873 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001874 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001875 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001876}
1877
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001878
Victor Stinnerbc603d12011-10-02 01:00:40 +02001879/* Widen Unicode objects to larger buffers. Don't write terminating null
1880 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001881
1882void*
1883_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1884{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001885 Py_ssize_t len;
1886 void *result;
1887 unsigned int skind;
1888
1889 if (PyUnicode_READY(s))
1890 return NULL;
1891
1892 len = PyUnicode_GET_LENGTH(s);
1893 skind = PyUnicode_KIND(s);
1894 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001895 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001896 return NULL;
1897 }
1898 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001899 case PyUnicode_2BYTE_KIND:
1900 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1901 if (!result)
1902 return PyErr_NoMemory();
1903 assert(skind == PyUnicode_1BYTE_KIND);
1904 _PyUnicode_CONVERT_BYTES(
1905 Py_UCS1, Py_UCS2,
1906 PyUnicode_1BYTE_DATA(s),
1907 PyUnicode_1BYTE_DATA(s) + len,
1908 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001909 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001910 case PyUnicode_4BYTE_KIND:
1911 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1912 if (!result)
1913 return PyErr_NoMemory();
1914 if (skind == PyUnicode_2BYTE_KIND) {
1915 _PyUnicode_CONVERT_BYTES(
1916 Py_UCS2, Py_UCS4,
1917 PyUnicode_2BYTE_DATA(s),
1918 PyUnicode_2BYTE_DATA(s) + len,
1919 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001920 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001921 else {
1922 assert(skind == PyUnicode_1BYTE_KIND);
1923 _PyUnicode_CONVERT_BYTES(
1924 Py_UCS1, Py_UCS4,
1925 PyUnicode_1BYTE_DATA(s),
1926 PyUnicode_1BYTE_DATA(s) + len,
1927 result);
1928 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001929 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001930 default:
1931 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001932 }
Victor Stinner01698042011-10-04 00:04:26 +02001933 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001934 return NULL;
1935}
1936
1937static Py_UCS4*
1938as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1939 int copy_null)
1940{
1941 int kind;
1942 void *data;
1943 Py_ssize_t len, targetlen;
1944 if (PyUnicode_READY(string) == -1)
1945 return NULL;
1946 kind = PyUnicode_KIND(string);
1947 data = PyUnicode_DATA(string);
1948 len = PyUnicode_GET_LENGTH(string);
1949 targetlen = len;
1950 if (copy_null)
1951 targetlen++;
1952 if (!target) {
1953 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1954 PyErr_NoMemory();
1955 return NULL;
1956 }
1957 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1958 if (!target) {
1959 PyErr_NoMemory();
1960 return NULL;
1961 }
1962 }
1963 else {
1964 if (targetsize < targetlen) {
1965 PyErr_Format(PyExc_SystemError,
1966 "string is longer than the buffer");
1967 if (copy_null && 0 < targetsize)
1968 target[0] = 0;
1969 return NULL;
1970 }
1971 }
1972 if (kind != PyUnicode_4BYTE_KIND) {
1973 Py_ssize_t i;
1974 for (i = 0; i < len; i++)
1975 target[i] = PyUnicode_READ(kind, data, i);
1976 }
1977 else
1978 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1979 if (copy_null)
1980 target[len] = 0;
1981 return target;
1982}
1983
1984Py_UCS4*
1985PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1986 int copy_null)
1987{
1988 if (target == NULL || targetsize < 1) {
1989 PyErr_BadInternalCall();
1990 return NULL;
1991 }
1992 return as_ucs4(string, target, targetsize, copy_null);
1993}
1994
1995Py_UCS4*
1996PyUnicode_AsUCS4Copy(PyObject *string)
1997{
1998 return as_ucs4(string, NULL, 0, 1);
1999}
2000
2001#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002002
Alexander Belopolsky40018472011-02-26 01:02:56 +00002003PyObject *
2004PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002005{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002006 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +00002007 if (size == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002008 return PyUnicode_New(0, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00002009 PyErr_BadInternalCall();
2010 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002011 }
2012
Martin v. Löwis790465f2008-04-05 20:41:37 +00002013 if (size == -1) {
2014 size = wcslen(w);
2015 }
2016
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002017 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002018}
2019
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002020#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002021
Walter Dörwald346737f2007-05-31 10:44:43 +00002022static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002023makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2024 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002025{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002026 *fmt++ = '%';
2027 if (width) {
2028 if (zeropad)
2029 *fmt++ = '0';
2030 fmt += sprintf(fmt, "%d", width);
2031 }
2032 if (precision)
2033 fmt += sprintf(fmt, ".%d", precision);
2034 if (longflag)
2035 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002036 else if (longlongflag) {
2037 /* longlongflag should only ever be nonzero on machines with
2038 HAVE_LONG_LONG defined */
2039#ifdef HAVE_LONG_LONG
2040 char *f = PY_FORMAT_LONG_LONG;
2041 while (*f)
2042 *fmt++ = *f++;
2043#else
2044 /* we shouldn't ever get here */
2045 assert(0);
2046 *fmt++ = 'l';
2047#endif
2048 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002049 else if (size_tflag) {
2050 char *f = PY_FORMAT_SIZE_T;
2051 while (*f)
2052 *fmt++ = *f++;
2053 }
2054 *fmt++ = c;
2055 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002056}
2057
Victor Stinner96865452011-03-01 23:44:09 +00002058/* helper for PyUnicode_FromFormatV() */
2059
2060static const char*
2061parse_format_flags(const char *f,
2062 int *p_width, int *p_precision,
2063 int *p_longflag, int *p_longlongflag, int *p_size_tflag)
2064{
2065 int width, precision, longflag, longlongflag, size_tflag;
2066
2067 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2068 f++;
2069 width = 0;
2070 while (Py_ISDIGIT((unsigned)*f))
2071 width = (width*10) + *f++ - '0';
2072 precision = 0;
2073 if (*f == '.') {
2074 f++;
2075 while (Py_ISDIGIT((unsigned)*f))
2076 precision = (precision*10) + *f++ - '0';
2077 if (*f == '%') {
2078 /* "%.3%s" => f points to "3" */
2079 f--;
2080 }
2081 }
2082 if (*f == '\0') {
2083 /* bogus format "%.1" => go backward, f points to "1" */
2084 f--;
2085 }
2086 if (p_width != NULL)
2087 *p_width = width;
2088 if (p_precision != NULL)
2089 *p_precision = precision;
2090
2091 /* Handle %ld, %lu, %lld and %llu. */
2092 longflag = 0;
2093 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002094 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002095
2096 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002097 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002098 longflag = 1;
2099 ++f;
2100 }
2101#ifdef HAVE_LONG_LONG
2102 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002103 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002104 longlongflag = 1;
2105 f += 2;
2106 }
2107#endif
2108 }
2109 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002110 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002111 size_tflag = 1;
2112 ++f;
2113 }
2114 if (p_longflag != NULL)
2115 *p_longflag = longflag;
2116 if (p_longlongflag != NULL)
2117 *p_longlongflag = longlongflag;
2118 if (p_size_tflag != NULL)
2119 *p_size_tflag = size_tflag;
2120 return f;
2121}
2122
/* Upper bound on the number of characters produced by a %ld conversion:
   21 characters cover a 64-bit integer written in decimal together with
   an optional sign. */
#define MAX_LONG_CHARS 21
/* Upper bound on the number of characters produced by a %lld conversion.
   At most ceil(log10(256)*SIZEOF_LONG_LONG) digits are needed, plus 1
   for the sign; 53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2130
Walter Dörwaldd2034312007-05-18 16:29:38 +00002131PyObject *
2132PyUnicode_FromFormatV(const char *format, va_list vargs)
2133{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002134 va_list count;
2135 Py_ssize_t callcount = 0;
2136 PyObject **callresults = NULL;
2137 PyObject **callresult = NULL;
2138 Py_ssize_t n = 0;
2139 int width = 0;
2140 int precision = 0;
2141 int zeropad;
2142 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002143 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002144 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002145 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002146 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2147 Py_UCS4 argmaxchar;
2148 Py_ssize_t numbersize = 0;
2149 char *numberresults = NULL;
2150 char *numberresult = NULL;
2151 Py_ssize_t i;
2152 int kind;
2153 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002154
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002155 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002156 /* step 1: count the number of %S/%R/%A/%s format specifications
2157 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2158 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002159 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002160 * also estimate a upper bound for all the number formats in the string,
2161 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002162 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002163 for (f = format; *f; f++) {
2164 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002165 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002166 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2167 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2168 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2169 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002170
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002171 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002172#ifdef HAVE_LONG_LONG
2173 if (longlongflag) {
2174 if (width < MAX_LONG_LONG_CHARS)
2175 width = MAX_LONG_LONG_CHARS;
2176 }
2177 else
2178#endif
2179 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2180 including sign. Decimal takes the most space. This
2181 isn't enough for octal. If a width is specified we
2182 need more (which we allocate later). */
2183 if (width < MAX_LONG_CHARS)
2184 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002185
2186 /* account for the size + '\0' to separate numbers
2187 inside of the numberresults buffer */
2188 numbersize += (width + 1);
2189 }
2190 }
2191 else if ((unsigned char)*f > 127) {
2192 PyErr_Format(PyExc_ValueError,
2193 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2194 "string, got a non-ASCII byte: 0x%02x",
2195 (unsigned char)*f);
2196 return NULL;
2197 }
2198 }
2199 /* step 2: allocate memory for the results of
2200 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2201 if (callcount) {
2202 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2203 if (!callresults) {
2204 PyErr_NoMemory();
2205 return NULL;
2206 }
2207 callresult = callresults;
2208 }
2209 /* step 2.5: allocate memory for the results of formating numbers */
2210 if (numbersize) {
2211 numberresults = PyObject_Malloc(numbersize);
2212 if (!numberresults) {
2213 PyErr_NoMemory();
2214 goto fail;
2215 }
2216 numberresult = numberresults;
2217 }
2218
2219 /* step 3: format numbers and figure out how large a buffer we need */
2220 for (f = format; *f; f++) {
2221 if (*f == '%') {
2222 const char* p;
2223 int longflag;
2224 int longlongflag;
2225 int size_tflag;
2226 int numprinted;
2227
2228 p = f;
2229 zeropad = (f[1] == '0');
2230 f = parse_format_flags(f, &width, &precision,
2231 &longflag, &longlongflag, &size_tflag);
2232 switch (*f) {
2233 case 'c':
2234 {
2235 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002236 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002237 n++;
2238 break;
2239 }
2240 case '%':
2241 n++;
2242 break;
2243 case 'i':
2244 case 'd':
2245 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2246 width, precision, *f);
2247 if (longflag)
2248 numprinted = sprintf(numberresult, fmt,
2249 va_arg(count, long));
2250#ifdef HAVE_LONG_LONG
2251 else if (longlongflag)
2252 numprinted = sprintf(numberresult, fmt,
2253 va_arg(count, PY_LONG_LONG));
2254#endif
2255 else if (size_tflag)
2256 numprinted = sprintf(numberresult, fmt,
2257 va_arg(count, Py_ssize_t));
2258 else
2259 numprinted = sprintf(numberresult, fmt,
2260 va_arg(count, int));
2261 n += numprinted;
2262 /* advance by +1 to skip over the '\0' */
2263 numberresult += (numprinted + 1);
2264 assert(*(numberresult - 1) == '\0');
2265 assert(*(numberresult - 2) != '\0');
2266 assert(numprinted >= 0);
2267 assert(numberresult <= numberresults + numbersize);
2268 break;
2269 case 'u':
2270 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2271 width, precision, 'u');
2272 if (longflag)
2273 numprinted = sprintf(numberresult, fmt,
2274 va_arg(count, unsigned long));
2275#ifdef HAVE_LONG_LONG
2276 else if (longlongflag)
2277 numprinted = sprintf(numberresult, fmt,
2278 va_arg(count, unsigned PY_LONG_LONG));
2279#endif
2280 else if (size_tflag)
2281 numprinted = sprintf(numberresult, fmt,
2282 va_arg(count, size_t));
2283 else
2284 numprinted = sprintf(numberresult, fmt,
2285 va_arg(count, unsigned int));
2286 n += numprinted;
2287 numberresult += (numprinted + 1);
2288 assert(*(numberresult - 1) == '\0');
2289 assert(*(numberresult - 2) != '\0');
2290 assert(numprinted >= 0);
2291 assert(numberresult <= numberresults + numbersize);
2292 break;
2293 case 'x':
2294 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2295 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2296 n += numprinted;
2297 numberresult += (numprinted + 1);
2298 assert(*(numberresult - 1) == '\0');
2299 assert(*(numberresult - 2) != '\0');
2300 assert(numprinted >= 0);
2301 assert(numberresult <= numberresults + numbersize);
2302 break;
2303 case 'p':
2304 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2305 /* %p is ill-defined: ensure leading 0x. */
2306 if (numberresult[1] == 'X')
2307 numberresult[1] = 'x';
2308 else if (numberresult[1] != 'x') {
2309 memmove(numberresult + 2, numberresult,
2310 strlen(numberresult) + 1);
2311 numberresult[0] = '0';
2312 numberresult[1] = 'x';
2313 numprinted += 2;
2314 }
2315 n += numprinted;
2316 numberresult += (numprinted + 1);
2317 assert(*(numberresult - 1) == '\0');
2318 assert(*(numberresult - 2) != '\0');
2319 assert(numprinted >= 0);
2320 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002321 break;
2322 case 's':
2323 {
2324 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002325 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002326 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2327 if (!str)
2328 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002329 /* since PyUnicode_DecodeUTF8 returns already flexible
2330 unicode objects, there is no need to call ready on them */
2331 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002332 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002333 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002334 /* Remember the str and switch to the next slot */
2335 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002336 break;
2337 }
2338 case 'U':
2339 {
2340 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002341 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002342 if (PyUnicode_READY(obj) == -1)
2343 goto fail;
2344 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002345 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002346 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002347 break;
2348 }
2349 case 'V':
2350 {
2351 PyObject *obj = va_arg(count, PyObject *);
2352 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002353 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002354 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002355 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002356 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002357 if (PyUnicode_READY(obj) == -1)
2358 goto fail;
2359 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002360 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002361 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002362 *callresult++ = NULL;
2363 }
2364 else {
2365 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2366 if (!str_obj)
2367 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002368 if (PyUnicode_READY(str_obj)) {
2369 Py_DECREF(str_obj);
2370 goto fail;
2371 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002372 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002373 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002374 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002375 *callresult++ = str_obj;
2376 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002377 break;
2378 }
2379 case 'S':
2380 {
2381 PyObject *obj = va_arg(count, PyObject *);
2382 PyObject *str;
2383 assert(obj);
2384 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002385 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002386 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002387 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002388 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002389 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002390 /* Remember the str and switch to the next slot */
2391 *callresult++ = str;
2392 break;
2393 }
2394 case 'R':
2395 {
2396 PyObject *obj = va_arg(count, PyObject *);
2397 PyObject *repr;
2398 assert(obj);
2399 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002400 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002401 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002402 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002403 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002404 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002405 /* Remember the repr and switch to the next slot */
2406 *callresult++ = repr;
2407 break;
2408 }
2409 case 'A':
2410 {
2411 PyObject *obj = va_arg(count, PyObject *);
2412 PyObject *ascii;
2413 assert(obj);
2414 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002415 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002416 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002417 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002418 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002419 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002420 /* Remember the repr and switch to the next slot */
2421 *callresult++ = ascii;
2422 break;
2423 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002424 default:
2425 /* if we stumble upon an unknown
2426 formatting code, copy the rest of
2427 the format string to the output
2428 string. (we cannot just skip the
2429 code, since there's no way to know
2430 what's in the argument list) */
2431 n += strlen(p);
2432 goto expand;
2433 }
2434 } else
2435 n++;
2436 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002437 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002438 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002439 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002440 we don't have to resize the string.
2441 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002442 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002443 if (!string)
2444 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002445 kind = PyUnicode_KIND(string);
2446 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002447 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002448 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002449
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002450 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002451 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002452 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002453
2454 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002455 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2456 /* checking for == because the last argument could be a empty
2457 string, which causes i to point to end, the assert at the end of
2458 the loop */
2459 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002460
Benjamin Peterson14339b62009-01-31 16:36:08 +00002461 switch (*f) {
2462 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002463 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002464 const int ordinal = va_arg(vargs, int);
2465 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002466 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002467 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002468 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002469 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002470 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002471 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002472 case 'p':
2473 /* unused, since we already have the result */
2474 if (*f == 'p')
2475 (void) va_arg(vargs, void *);
2476 else
2477 (void) va_arg(vargs, int);
2478 /* extract the result from numberresults and append. */
2479 for (; *numberresult; ++i, ++numberresult)
2480 PyUnicode_WRITE(kind, data, i, *numberresult);
2481 /* skip over the separating '\0' */
2482 assert(*numberresult == '\0');
2483 numberresult++;
2484 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002485 break;
2486 case 's':
2487 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002488 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002489 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002490 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002491 size = PyUnicode_GET_LENGTH(*callresult);
2492 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002493 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002494 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002495 /* We're done with the unicode()/repr() => forget it */
2496 Py_DECREF(*callresult);
2497 /* switch to next unicode()/repr() result */
2498 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002499 break;
2500 }
2501 case 'U':
2502 {
2503 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002504 Py_ssize_t size;
2505 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2506 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002507 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002508 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002509 break;
2510 }
2511 case 'V':
2512 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002513 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002514 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002515 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002516 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002517 size = PyUnicode_GET_LENGTH(obj);
2518 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002519 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002520 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002521 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002522 size = PyUnicode_GET_LENGTH(*callresult);
2523 assert(PyUnicode_KIND(*callresult) <=
2524 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002525 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002526 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002527 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002528 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002529 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002530 break;
2531 }
2532 case 'S':
2533 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002534 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002535 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002536 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002537 /* unused, since we already have the result */
2538 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002539 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002540 copy_characters(string, i, *callresult, 0, size);
2541 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002542 /* We're done with the unicode()/repr() => forget it */
2543 Py_DECREF(*callresult);
2544 /* switch to next unicode()/repr() result */
2545 ++callresult;
2546 break;
2547 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002548 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002549 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002550 break;
2551 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002552 for (; *p; ++p, ++i)
2553 PyUnicode_WRITE(kind, data, i, *p);
2554 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002555 goto end;
2556 }
Victor Stinner1205f272010-09-11 00:54:47 +00002557 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002558 else {
2559 assert(i < PyUnicode_GET_LENGTH(string));
2560 PyUnicode_WRITE(kind, data, i++, *f);
2561 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002562 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002563 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002564
Benjamin Peterson29060642009-01-31 22:14:21 +00002565 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002566 if (callresults)
2567 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002568 if (numberresults)
2569 PyObject_Free(numberresults);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002570 assert(_PyUnicode_CheckConsistency(string, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002571 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002572 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002573 if (callresults) {
2574 PyObject **callresult2 = callresults;
2575 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002576 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002577 ++callresult2;
2578 }
2579 PyObject_Free(callresults);
2580 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002581 if (numberresults)
2582 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002583 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002584}
2585
Walter Dörwaldd2034312007-05-18 16:29:38 +00002586PyObject *
2587PyUnicode_FromFormat(const char *format, ...)
2588{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002589 PyObject* ret;
2590 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002591
2592#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002593 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002594#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002595 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002596#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002597 ret = PyUnicode_FromFormatV(format, vargs);
2598 va_end(vargs);
2599 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002600}
2601
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002602#ifdef HAVE_WCHAR_H
2603
Victor Stinner5593d8a2010-10-02 11:11:27 +00002604/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2605 convert a Unicode object to a wide character string.
2606
Victor Stinnerd88d9832011-09-06 02:00:05 +02002607 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002608 character) required to convert the unicode object. Ignore size argument.
2609
Victor Stinnerd88d9832011-09-06 02:00:05 +02002610 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002611 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002612 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002613static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002614unicode_aswidechar(PyUnicodeObject *unicode,
2615 wchar_t *w,
2616 Py_ssize_t size)
2617{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002618 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002619 const wchar_t *wstr;
2620
2621 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2622 if (wstr == NULL)
2623 return -1;
2624
Victor Stinner5593d8a2010-10-02 11:11:27 +00002625 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002626 if (size > res)
2627 size = res + 1;
2628 else
2629 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002630 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002631 return res;
2632 }
2633 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002634 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002635}
2636
2637Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002638PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002639 wchar_t *w,
2640 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002641{
2642 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002643 PyErr_BadInternalCall();
2644 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002645 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002646 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002647}
2648
Victor Stinner137c34c2010-09-29 10:25:54 +00002649wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002650PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002651 Py_ssize_t *size)
2652{
2653 wchar_t* buffer;
2654 Py_ssize_t buflen;
2655
2656 if (unicode == NULL) {
2657 PyErr_BadInternalCall();
2658 return NULL;
2659 }
2660
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002661 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002662 if (buflen == -1)
2663 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002664 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002665 PyErr_NoMemory();
2666 return NULL;
2667 }
2668
Victor Stinner137c34c2010-09-29 10:25:54 +00002669 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2670 if (buffer == NULL) {
2671 PyErr_NoMemory();
2672 return NULL;
2673 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002674 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002675 if (buflen == -1)
2676 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002677 if (size != NULL)
2678 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002679 return buffer;
2680}
2681
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002682#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002683
Alexander Belopolsky40018472011-02-26 01:02:56 +00002684PyObject *
2685PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002686{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002687 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002688 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002689 PyErr_SetString(PyExc_ValueError,
2690 "chr() arg not in range(0x110000)");
2691 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002692 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002693
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002694 if (ordinal < 256)
2695 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002696
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002697 v = PyUnicode_New(1, ordinal);
2698 if (v == NULL)
2699 return NULL;
2700 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002701 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002702 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002703}
2704
Alexander Belopolsky40018472011-02-26 01:02:56 +00002705PyObject *
2706PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002707{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002708 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002709 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002710 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002711 if (PyUnicode_READY(obj))
2712 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002713 Py_INCREF(obj);
2714 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002715 }
2716 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002717 /* For a Unicode subtype that's not a Unicode object,
2718 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002719 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002720 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002721 PyErr_Format(PyExc_TypeError,
2722 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002723 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002724 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002725}
2726
Alexander Belopolsky40018472011-02-26 01:02:56 +00002727PyObject *
2728PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002729 const char *encoding,
2730 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002731{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002732 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002733 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002734
Guido van Rossumd57fd912000-03-10 22:53:23 +00002735 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002736 PyErr_BadInternalCall();
2737 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002738 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002739
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002740 /* Decoding bytes objects is the most common case and should be fast */
2741 if (PyBytes_Check(obj)) {
2742 if (PyBytes_GET_SIZE(obj) == 0) {
2743 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002744 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002745 }
2746 else {
2747 v = PyUnicode_Decode(
2748 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2749 encoding, errors);
2750 }
2751 return v;
2752 }
2753
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002754 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002755 PyErr_SetString(PyExc_TypeError,
2756 "decoding str is not supported");
2757 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002758 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002759
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002760 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2761 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2762 PyErr_Format(PyExc_TypeError,
2763 "coercing to str: need bytes, bytearray "
2764 "or buffer-like object, %.80s found",
2765 Py_TYPE(obj)->tp_name);
2766 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002767 }
Tim Petersced69f82003-09-16 20:30:58 +00002768
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002769 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002770 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002771 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002772 }
Tim Petersced69f82003-09-16 20:30:58 +00002773 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002774 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002775
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002776 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002777 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002778}
2779
/* Write a normalized copy of `encoding` into `lower`: upper case
   letters become lower case and '_' becomes '-', so that spellings such
   as UTF_8 are recognized.

   `lower_len` is the capacity of `lower` including the terminating
   null.  Returns 1 on success and 0 if the encoding name does not fit
   (it is longer than lower_len-1); in that case `lower` is left without
   a terminating null. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* Last slot of the output buffer, reserved for the terminator. */
    char *const last = &lower[lower_len - 1];

    for (src = encoding; *src; src++) {
        char ch = *src;

        if (dst == last)
            return 0;

        if (Py_ISUPPER(ch))
            ch = Py_TOLOWER(ch);
        else if (ch == '_')
            ch = '-';
        *dst++ = ch;
    }
    *dst = '\0';
    return 1;
}
2812
Alexander Belopolsky40018472011-02-26 01:02:56 +00002813PyObject *
2814PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002815 Py_ssize_t size,
2816 const char *encoding,
2817 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002818{
2819 PyObject *buffer = NULL, *unicode;
2820 Py_buffer info;
2821 char lower[11]; /* Enough for any encoding shortcut */
2822
2823 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002824 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002825
2826 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002827 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002828 if ((strcmp(lower, "utf-8") == 0) ||
2829 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002830 return PyUnicode_DecodeUTF8(s, size, errors);
2831 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002832 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002833 (strcmp(lower, "iso-8859-1") == 0))
2834 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002835#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002836 else if (strcmp(lower, "mbcs") == 0)
2837 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002838#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002839 else if (strcmp(lower, "ascii") == 0)
2840 return PyUnicode_DecodeASCII(s, size, errors);
2841 else if (strcmp(lower, "utf-16") == 0)
2842 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2843 else if (strcmp(lower, "utf-32") == 0)
2844 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2845 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002846
2847 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002848 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002849 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002850 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002851 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002852 if (buffer == NULL)
2853 goto onError;
2854 unicode = PyCodec_Decode(buffer, encoding, errors);
2855 if (unicode == NULL)
2856 goto onError;
2857 if (!PyUnicode_Check(unicode)) {
2858 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002859 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002860 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002861 Py_DECREF(unicode);
2862 goto onError;
2863 }
2864 Py_DECREF(buffer);
Victor Stinner17efeed2011-10-04 20:05:46 +02002865#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02002866 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002867 Py_DECREF(unicode);
2868 return NULL;
2869 }
Victor Stinner17efeed2011-10-04 20:05:46 +02002870#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002871 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002872 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002873
Benjamin Peterson29060642009-01-31 22:14:21 +00002874 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002875 Py_XDECREF(buffer);
2876 return NULL;
2877}
2878
Alexander Belopolsky40018472011-02-26 01:02:56 +00002879PyObject *
2880PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002881 const char *encoding,
2882 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002883{
2884 PyObject *v;
2885
2886 if (!PyUnicode_Check(unicode)) {
2887 PyErr_BadArgument();
2888 goto onError;
2889 }
2890
2891 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002892 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002893
2894 /* Decode via the codec registry */
2895 v = PyCodec_Decode(unicode, encoding, errors);
2896 if (v == NULL)
2897 goto onError;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002898 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002899 return v;
2900
Benjamin Peterson29060642009-01-31 22:14:21 +00002901 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002902 return NULL;
2903}
2904
Alexander Belopolsky40018472011-02-26 01:02:56 +00002905PyObject *
2906PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002907 const char *encoding,
2908 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002909{
2910 PyObject *v;
2911
2912 if (!PyUnicode_Check(unicode)) {
2913 PyErr_BadArgument();
2914 goto onError;
2915 }
2916
2917 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002918 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002919
2920 /* Decode via the codec registry */
2921 v = PyCodec_Decode(unicode, encoding, errors);
2922 if (v == NULL)
2923 goto onError;
2924 if (!PyUnicode_Check(v)) {
2925 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002926 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002927 Py_TYPE(v)->tp_name);
2928 Py_DECREF(v);
2929 goto onError;
2930 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002931 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002932 return v;
2933
Benjamin Peterson29060642009-01-31 22:14:21 +00002934 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002935 return NULL;
2936}
2937
Alexander Belopolsky40018472011-02-26 01:02:56 +00002938PyObject *
2939PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002940 Py_ssize_t size,
2941 const char *encoding,
2942 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002943{
2944 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002945
Guido van Rossumd57fd912000-03-10 22:53:23 +00002946 unicode = PyUnicode_FromUnicode(s, size);
2947 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002948 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002949 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2950 Py_DECREF(unicode);
2951 return v;
2952}
2953
Alexander Belopolsky40018472011-02-26 01:02:56 +00002954PyObject *
2955PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002956 const char *encoding,
2957 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002958{
2959 PyObject *v;
2960
2961 if (!PyUnicode_Check(unicode)) {
2962 PyErr_BadArgument();
2963 goto onError;
2964 }
2965
2966 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002967 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002968
2969 /* Encode via the codec registry */
2970 v = PyCodec_Encode(unicode, encoding, errors);
2971 if (v == NULL)
2972 goto onError;
2973 return v;
2974
Benjamin Peterson29060642009-01-31 22:14:21 +00002975 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002976 return NULL;
2977}
2978
Victor Stinnerad158722010-10-27 00:25:46 +00002979PyObject *
2980PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002981{
Victor Stinner99b95382011-07-04 14:23:54 +02002982#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002983 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2984 PyUnicode_GET_SIZE(unicode),
2985 NULL);
2986#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002987 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002988#else
Victor Stinner793b5312011-04-27 00:24:21 +02002989 PyInterpreterState *interp = PyThreadState_GET()->interp;
2990 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2991 cannot use it to encode and decode filenames before it is loaded. Load
2992 the Python codec requires to encode at least its own filename. Use the C
2993 version of the locale codec until the codec registry is initialized and
2994 the Python codec is loaded.
2995
2996 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2997 cannot only rely on it: check also interp->fscodec_initialized for
2998 subinterpreters. */
2999 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003000 return PyUnicode_AsEncodedString(unicode,
3001 Py_FileSystemDefaultEncoding,
3002 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003003 }
3004 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003005 /* locale encoding with surrogateescape */
3006 wchar_t *wchar;
3007 char *bytes;
3008 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00003009 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003010
3011 wchar = PyUnicode_AsWideCharString(unicode, NULL);
3012 if (wchar == NULL)
3013 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003014 bytes = _Py_wchar2char(wchar, &error_pos);
3015 if (bytes == NULL) {
3016 if (error_pos != (size_t)-1) {
3017 char *errmsg = strerror(errno);
3018 PyObject *exc = NULL;
3019 if (errmsg == NULL)
3020 errmsg = "Py_wchar2char() failed";
3021 raise_encode_exception(&exc,
3022 "filesystemencoding",
3023 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
3024 error_pos, error_pos+1,
3025 errmsg);
3026 Py_XDECREF(exc);
3027 }
3028 else
3029 PyErr_NoMemory();
3030 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003031 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003032 }
3033 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003034
3035 bytes_obj = PyBytes_FromString(bytes);
3036 PyMem_Free(bytes);
3037 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00003038 }
Victor Stinnerad158722010-10-27 00:25:46 +00003039#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003040}
3041
Alexander Belopolsky40018472011-02-26 01:02:56 +00003042PyObject *
3043PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003044 const char *encoding,
3045 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003046{
3047 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003048 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003049
Guido van Rossumd57fd912000-03-10 22:53:23 +00003050 if (!PyUnicode_Check(unicode)) {
3051 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003052 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003053 }
Fred Drakee4315f52000-05-09 19:53:39 +00003054
Victor Stinner2f283c22011-03-02 01:21:46 +00003055 if (encoding == NULL) {
3056 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003057 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003058 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003059 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00003060 }
Fred Drakee4315f52000-05-09 19:53:39 +00003061
3062 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003063 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003064 if ((strcmp(lower, "utf-8") == 0) ||
3065 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003066 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003067 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003068 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003069 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003070 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003071 }
Victor Stinner37296e82010-06-10 13:36:23 +00003072 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003073 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003074 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003075 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003076#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003077 else if (strcmp(lower, "mbcs") == 0)
3078 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3079 PyUnicode_GET_SIZE(unicode),
3080 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003081#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003082 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003083 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003084 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003085
3086 /* Encode via the codec registry */
3087 v = PyCodec_Encode(unicode, encoding, errors);
3088 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003089 return NULL;
3090
3091 /* The normal path */
3092 if (PyBytes_Check(v))
3093 return v;
3094
3095 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003096 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003097 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003098 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003099
3100 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3101 "encoder %s returned bytearray instead of bytes",
3102 encoding);
3103 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003104 Py_DECREF(v);
3105 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003106 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003107
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003108 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3109 Py_DECREF(v);
3110 return b;
3111 }
3112
3113 PyErr_Format(PyExc_TypeError,
3114 "encoder did not return a bytes object (type=%.400s)",
3115 Py_TYPE(v)->tp_name);
3116 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003117 return NULL;
3118}
3119
Alexander Belopolsky40018472011-02-26 01:02:56 +00003120PyObject *
3121PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003122 const char *encoding,
3123 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003124{
3125 PyObject *v;
3126
3127 if (!PyUnicode_Check(unicode)) {
3128 PyErr_BadArgument();
3129 goto onError;
3130 }
3131
3132 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003133 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003134
3135 /* Encode via the codec registry */
3136 v = PyCodec_Encode(unicode, encoding, errors);
3137 if (v == NULL)
3138 goto onError;
3139 if (!PyUnicode_Check(v)) {
3140 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003141 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003142 Py_TYPE(v)->tp_name);
3143 Py_DECREF(v);
3144 goto onError;
3145 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003146 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003147
Benjamin Peterson29060642009-01-31 22:14:21 +00003148 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003149 return NULL;
3150}
3151
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003152PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003153PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003154 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003155 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3156}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003157
Christian Heimes5894ba72007-11-04 11:43:14 +00003158PyObject*
3159PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3160{
Victor Stinner99b95382011-07-04 14:23:54 +02003161#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003162 return PyUnicode_DecodeMBCS(s, size, NULL);
3163#elif defined(__APPLE__)
3164 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
3165#else
Victor Stinner793b5312011-04-27 00:24:21 +02003166 PyInterpreterState *interp = PyThreadState_GET()->interp;
3167 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3168 cannot use it to encode and decode filenames before it is loaded. Load
3169 the Python codec requires to encode at least its own filename. Use the C
3170 version of the locale codec until the codec registry is initialized and
3171 the Python codec is loaded.
3172
3173 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3174 cannot only rely on it: check also interp->fscodec_initialized for
3175 subinterpreters. */
3176 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003177 return PyUnicode_Decode(s, size,
3178 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003179 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003180 }
3181 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003182 /* locale encoding with surrogateescape */
3183 wchar_t *wchar;
3184 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00003185 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003186
3187 if (s[size] != '\0' || size != strlen(s)) {
3188 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3189 return NULL;
3190 }
3191
Victor Stinner168e1172010-10-16 23:16:16 +00003192 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003193 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003194 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003195
Victor Stinner168e1172010-10-16 23:16:16 +00003196 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003197 PyMem_Free(wchar);
3198 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003199 }
Victor Stinnerad158722010-10-27 00:25:46 +00003200#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003201}
3202
Martin v. Löwis011e8422009-05-05 04:43:17 +00003203
3204int
3205PyUnicode_FSConverter(PyObject* arg, void* addr)
3206{
3207 PyObject *output = NULL;
3208 Py_ssize_t size;
3209 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003210 if (arg == NULL) {
3211 Py_DECREF(*(PyObject**)addr);
3212 return 1;
3213 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003214 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003215 output = arg;
3216 Py_INCREF(output);
3217 }
3218 else {
3219 arg = PyUnicode_FromObject(arg);
3220 if (!arg)
3221 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003222 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003223 Py_DECREF(arg);
3224 if (!output)
3225 return 0;
3226 if (!PyBytes_Check(output)) {
3227 Py_DECREF(output);
3228 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3229 return 0;
3230 }
3231 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003232 size = PyBytes_GET_SIZE(output);
3233 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003234 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003235 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003236 Py_DECREF(output);
3237 return 0;
3238 }
3239 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003240 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003241}
3242
3243
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003244int
3245PyUnicode_FSDecoder(PyObject* arg, void* addr)
3246{
3247 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003248 if (arg == NULL) {
3249 Py_DECREF(*(PyObject**)addr);
3250 return 1;
3251 }
3252 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003253 if (PyUnicode_READY(arg))
3254 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003255 output = arg;
3256 Py_INCREF(output);
3257 }
3258 else {
3259 arg = PyBytes_FromObject(arg);
3260 if (!arg)
3261 return 0;
3262 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3263 PyBytes_GET_SIZE(arg));
3264 Py_DECREF(arg);
3265 if (!output)
3266 return 0;
3267 if (!PyUnicode_Check(output)) {
3268 Py_DECREF(output);
3269 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3270 return 0;
3271 }
3272 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003273 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3274 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003275 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3276 Py_DECREF(output);
3277 return 0;
3278 }
3279 *(PyObject**)addr = output;
3280 return Py_CLEANUP_SUPPORTED;
3281}
3282
3283
Martin v. Löwis5b222132007-06-10 09:51:05 +00003284char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003285PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003286{
Christian Heimesf3863112007-11-22 07:46:41 +00003287 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003288 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
3289
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003290 if (!PyUnicode_Check(unicode)) {
3291 PyErr_BadArgument();
3292 return NULL;
3293 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003294 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003295 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003296
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003297 if (PyUnicode_UTF8(unicode) == NULL) {
3298 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003299 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3300 if (bytes == NULL)
3301 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003302 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3303 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003304 Py_DECREF(bytes);
3305 return NULL;
3306 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003307 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
3308 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003309 Py_DECREF(bytes);
3310 }
3311
3312 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003313 *psize = PyUnicode_UTF8_LENGTH(unicode);
3314 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003315}
3316
3317char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003318PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003319{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003320 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3321}
3322
#ifdef Py_DEBUG
/* Debug-build counter: PyUnicode_AsUnicodeAndSize() increments it each
   time it has to build the wchar_t representation of a string. */
int unicode_as_unicode_calls = 0;
#endif
3326
3327
3328Py_UNICODE *
3329PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3330{
3331 PyUnicodeObject *u;
3332 const unsigned char *one_byte;
3333#if SIZEOF_WCHAR_T == 4
3334 const Py_UCS2 *two_bytes;
3335#else
3336 const Py_UCS4 *four_bytes;
3337 const Py_UCS4 *ucs4_end;
3338 Py_ssize_t num_surrogates;
3339#endif
3340 wchar_t *w;
3341 wchar_t *wchar_end;
3342
3343 if (!PyUnicode_Check(unicode)) {
3344 PyErr_BadArgument();
3345 return NULL;
3346 }
3347 u = (PyUnicodeObject*)unicode;
3348 if (_PyUnicode_WSTR(u) == NULL) {
3349 /* Non-ASCII compact unicode object */
3350 assert(_PyUnicode_KIND(u) != 0);
3351 assert(PyUnicode_IS_READY(u));
3352
3353#ifdef Py_DEBUG
3354 ++unicode_as_unicode_calls;
3355#endif
3356
3357 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
3358#if SIZEOF_WCHAR_T == 2
3359 four_bytes = PyUnicode_4BYTE_DATA(u);
3360 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
3361 num_surrogates = 0;
3362
3363 for (; four_bytes < ucs4_end; ++four_bytes) {
3364 if (*four_bytes > 0xFFFF)
3365 ++num_surrogates;
3366 }
3367
3368 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3369 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3370 if (!_PyUnicode_WSTR(u)) {
3371 PyErr_NoMemory();
3372 return NULL;
3373 }
3374 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3375
3376 w = _PyUnicode_WSTR(u);
3377 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3378 four_bytes = PyUnicode_4BYTE_DATA(u);
3379 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3380 if (*four_bytes > 0xFFFF) {
3381 /* encode surrogate pair in this case */
3382 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3383 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3384 }
3385 else
3386 *w = *four_bytes;
3387
3388 if (w > wchar_end) {
3389 assert(0 && "Miscalculated string end");
3390 }
3391 }
3392 *w = 0;
3393#else
3394 /* sizeof(wchar_t) == 4 */
3395 Py_FatalError("Impossible unicode object state, wstr and str "
3396 "should share memory already.");
3397 return NULL;
3398#endif
3399 }
3400 else {
3401 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3402 (_PyUnicode_LENGTH(u) + 1));
3403 if (!_PyUnicode_WSTR(u)) {
3404 PyErr_NoMemory();
3405 return NULL;
3406 }
3407 if (!PyUnicode_IS_COMPACT_ASCII(u))
3408 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3409 w = _PyUnicode_WSTR(u);
3410 wchar_end = w + _PyUnicode_LENGTH(u);
3411
3412 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3413 one_byte = PyUnicode_1BYTE_DATA(u);
3414 for (; w < wchar_end; ++one_byte, ++w)
3415 *w = *one_byte;
3416 /* null-terminate the wstr */
3417 *w = 0;
3418 }
3419 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3420#if SIZEOF_WCHAR_T == 4
3421 two_bytes = PyUnicode_2BYTE_DATA(u);
3422 for (; w < wchar_end; ++two_bytes, ++w)
3423 *w = *two_bytes;
3424 /* null-terminate the wstr */
3425 *w = 0;
3426#else
3427 /* sizeof(wchar_t) == 2 */
3428 PyObject_FREE(_PyUnicode_WSTR(u));
3429 _PyUnicode_WSTR(u) = NULL;
3430 Py_FatalError("Impossible unicode object state, wstr "
3431 "and str should share memory already.");
3432 return NULL;
3433#endif
3434 }
3435 else {
3436 assert(0 && "This should never happen.");
3437 }
3438 }
3439 }
3440 if (size != NULL)
3441 *size = PyUnicode_WSTR_LENGTH(u);
3442 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003443}
3444
Alexander Belopolsky40018472011-02-26 01:02:56 +00003445Py_UNICODE *
3446PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003447{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003448 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003449}
3450
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003451
Alexander Belopolsky40018472011-02-26 01:02:56 +00003452Py_ssize_t
3453PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003454{
3455 if (!PyUnicode_Check(unicode)) {
3456 PyErr_BadArgument();
3457 goto onError;
3458 }
3459 return PyUnicode_GET_SIZE(unicode);
3460
Benjamin Peterson29060642009-01-31 22:14:21 +00003461 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003462 return -1;
3463}
3464
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003465Py_ssize_t
3466PyUnicode_GetLength(PyObject *unicode)
3467{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003468 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003469 PyErr_BadArgument();
3470 return -1;
3471 }
3472
3473 return PyUnicode_GET_LENGTH(unicode);
3474}
3475
3476Py_UCS4
3477PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3478{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003479 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3480 PyErr_BadArgument();
3481 return (Py_UCS4)-1;
3482 }
3483 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3484 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003485 return (Py_UCS4)-1;
3486 }
3487 return PyUnicode_READ_CHAR(unicode, index);
3488}
3489
3490int
3491PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3492{
3493 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003494 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003495 return -1;
3496 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003497 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3498 PyErr_SetString(PyExc_IndexError, "string index out of range");
3499 return -1;
3500 }
3501 if (_PyUnicode_Dirty(unicode))
3502 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003503 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3504 index, ch);
3505 return 0;
3506}
3507
/* Return the name of the default encoding, which is always "utf-8".
   The returned string is statically allocated. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3513
Victor Stinner554f3f02010-06-16 23:33:54 +00003514/* create or adjust a UnicodeDecodeError */
3515static void
3516make_decode_exception(PyObject **exceptionObject,
3517 const char *encoding,
3518 const char *input, Py_ssize_t length,
3519 Py_ssize_t startpos, Py_ssize_t endpos,
3520 const char *reason)
3521{
3522 if (*exceptionObject == NULL) {
3523 *exceptionObject = PyUnicodeDecodeError_Create(
3524 encoding, input, length, startpos, endpos, reason);
3525 }
3526 else {
3527 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3528 goto onError;
3529 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3530 goto onError;
3531 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3532 goto onError;
3533 }
3534 return;
3535
3536onError:
3537 Py_DECREF(*exceptionObject);
3538 *exceptionObject = NULL;
3539}
3540
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003541/* error handling callback helper:
3542 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003543 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003544 and adjust various state variables.
3545 return 0 on success, -1 on error
3546*/
3547
Alexander Belopolsky40018472011-02-26 01:02:56 +00003548static int
3549unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003550 const char *encoding, const char *reason,
3551 const char **input, const char **inend, Py_ssize_t *startinpos,
3552 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3553 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003554{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003555 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003556
3557 PyObject *restuple = NULL;
3558 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003559 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003560 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003561 Py_ssize_t requiredsize;
3562 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003563 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003564 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003565 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003566 int res = -1;
3567
3568 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003569 *errorHandler = PyCodec_LookupError(errors);
3570 if (*errorHandler == NULL)
3571 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003572 }
3573
Victor Stinner554f3f02010-06-16 23:33:54 +00003574 make_decode_exception(exceptionObject,
3575 encoding,
3576 *input, *inend - *input,
3577 *startinpos, *endinpos,
3578 reason);
3579 if (*exceptionObject == NULL)
3580 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003581
3582 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3583 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003584 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003585 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003586 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003587 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003588 }
3589 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003590 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003591
3592 /* Copy back the bytes variables, which might have been modified by the
3593 callback */
3594 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3595 if (!inputobj)
3596 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003597 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003598 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003599 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003600 *input = PyBytes_AS_STRING(inputobj);
3601 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003602 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003603 /* we can DECREF safely, as the exception has another reference,
3604 so the object won't go away. */
3605 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003606
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003607 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003608 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003609 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003610 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3611 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003612 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003613
3614 /* need more space? (at least enough for what we
3615 have+the replacement+the rest of the string (starting
3616 at the new input position), so we won't have to check space
3617 when there are no errors in the rest of the string) */
3618 repptr = PyUnicode_AS_UNICODE(repunicode);
3619 repsize = PyUnicode_GET_SIZE(repunicode);
3620 requiredsize = *outpos + repsize + insize-newpos;
3621 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003622 if (requiredsize<2*outsize)
3623 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003624 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003625 goto onError;
3626 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003627 }
3628 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003629 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003630 Py_UNICODE_COPY(*outptr, repptr, repsize);
3631 *outptr += repsize;
3632 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003633
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003634 /* we made it! */
3635 res = 0;
3636
Benjamin Peterson29060642009-01-31 22:14:21 +00003637 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003638 Py_XDECREF(restuple);
3639 return res;
3640}
3641
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003642/* --- UTF-7 Codec -------------------------------------------------------- */
3643
Antoine Pitrou244651a2009-05-04 18:56:13 +00003644/* See RFC2152 for details. We encode conservatively and decode liberally. */
3645
/* Helper macros for the base-64 alphabet used inside UTF-7 shift
   sequences (A-Z, a-z, 0-9, '+' and '/').  Each macro may evaluate its
   argument more than once, so pass side-effect-free expressions only. */

/* Non-zero iff c is one of the 64 base-64 characters. */

#define IS_BASE64(c)                            \
    ((c) == '+' || (c) == '/' ||                \
     ('0' <= (c) && (c) <= '9') ||              \
     ('a' <= (c) && (c) <= 'z') ||              \
     ('A' <= (c) && (c) <= 'Z'))

/* Map a base-64 character to its 6-bit value: A-Z -> 0..25,
   a-z -> 26..51, 0-9 -> 52..61, '+' -> 62.  Every other input
   (including '/') yields 63, so check IS_BASE64() first. */

#define FROM_BASE64(c)                                  \
    ((c) == '+' ? 62 :                                  \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :      \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ('A' <= (c) && (c) <= 'Z') ? (c) - 'A' : 63)

/* Base-64 character that encodes the low 6 bits of n. */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[0x3f & (n)])
3668
/* DECODE_DIRECT: non-zero when a byte met outside a shift sequence is
 * decoded as itself.  Decoding is permissive: every value up to 127 is
 * accepted verbatim except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c) \
    (!((c) == '+' || (c) > 127))
3676
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is only ever read,
 * hence it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
3710
/* ENCODE_DIRECT: non-zero when character c may be written to the UTF-7
 * output as itself.  Only codes 1..127 qualify.  Set D characters
 * (category 0) always do; Set O characters (category 1) only when
 * directO is true and whitespace (category 2) only when directWS is
 * true.  RFC2152 leaves both choices to the application, which is why
 * they are parameters here. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) > 0 && (c) < 128 &&                            \
     (utf7_category[(c)] == 0 ||                        \
      (utf7_category[(c)] == 1 && (directO)) ||         \
      (utf7_category[(c)] == 2 && (directWS))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003722
Alexander Belopolsky40018472011-02-26 01:02:56 +00003723PyObject *
3724PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003725 Py_ssize_t size,
3726 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003727{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003728 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3729}
3730
Antoine Pitrou244651a2009-05-04 18:56:13 +00003731/* The decoder. The only state we preserve is our read position,
3732 * i.e. how many characters we have consumed. So if we end in the
3733 * middle of a shift sequence we have to back off the read position
3734 * and the output to the beginning of the sequence, otherwise we lose
3735 * all the shift state (seen bits, number of bits seen, high
3736 * surrogate). */
3737
Alexander Belopolsky40018472011-02-26 01:02:56 +00003738PyObject *
3739PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003740 Py_ssize_t size,
3741 const char *errors,
3742 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003743{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003744 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003745 Py_ssize_t startinpos;
3746 Py_ssize_t endinpos;
3747 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003748 const char *e;
3749 PyUnicodeObject *unicode;
3750 Py_UNICODE *p;
3751 const char *errmsg = "";
3752 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003753 Py_UNICODE *shiftOutStart;
3754 unsigned int base64bits = 0;
3755 unsigned long base64buffer = 0;
3756 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003757 PyObject *errorHandler = NULL;
3758 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003759
3760 unicode = _PyUnicode_New(size);
3761 if (!unicode)
3762 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003763 if (size == 0) {
3764 if (consumed)
3765 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003766 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003767 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003768
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003769 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003770 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003771 e = s + size;
3772
3773 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003774 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003775 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003776 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003777
Antoine Pitrou244651a2009-05-04 18:56:13 +00003778 if (inShift) { /* in a base-64 section */
3779 if (IS_BASE64(ch)) { /* consume a base-64 character */
3780 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3781 base64bits += 6;
3782 s++;
3783 if (base64bits >= 16) {
3784 /* we have enough bits for a UTF-16 value */
3785 Py_UNICODE outCh = (Py_UNICODE)
3786 (base64buffer >> (base64bits-16));
3787 base64bits -= 16;
3788 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3789 if (surrogate) {
3790 /* expecting a second surrogate */
3791 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3792#ifdef Py_UNICODE_WIDE
3793 *p++ = (((surrogate & 0x3FF)<<10)
3794 | (outCh & 0x3FF)) + 0x10000;
3795#else
3796 *p++ = surrogate;
3797 *p++ = outCh;
3798#endif
3799 surrogate = 0;
3800 }
3801 else {
3802 surrogate = 0;
3803 errmsg = "second surrogate missing";
3804 goto utf7Error;
3805 }
3806 }
3807 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3808 /* first surrogate */
3809 surrogate = outCh;
3810 }
3811 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3812 errmsg = "unexpected second surrogate";
3813 goto utf7Error;
3814 }
3815 else {
3816 *p++ = outCh;
3817 }
3818 }
3819 }
3820 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003821 inShift = 0;
3822 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003823 if (surrogate) {
3824 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003825 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003826 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003827 if (base64bits > 0) { /* left-over bits */
3828 if (base64bits >= 6) {
3829 /* We've seen at least one base-64 character */
3830 errmsg = "partial character in shift sequence";
3831 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003832 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003833 else {
3834 /* Some bits remain; they should be zero */
3835 if (base64buffer != 0) {
3836 errmsg = "non-zero padding bits in shift sequence";
3837 goto utf7Error;
3838 }
3839 }
3840 }
3841 if (ch != '-') {
3842 /* '-' is absorbed; other terminating
3843 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003844 *p++ = ch;
3845 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003846 }
3847 }
3848 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003849 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003850 s++; /* consume '+' */
3851 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003852 s++;
3853 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003854 }
3855 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003856 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003857 shiftOutStart = p;
3858 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003859 }
3860 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003861 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003862 *p++ = ch;
3863 s++;
3864 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003865 else {
3866 startinpos = s-starts;
3867 s++;
3868 errmsg = "unexpected special character";
3869 goto utf7Error;
3870 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003871 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003872utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003873 outpos = p-PyUnicode_AS_UNICODE(unicode);
3874 endinpos = s-starts;
3875 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003876 errors, &errorHandler,
3877 "utf7", errmsg,
3878 &starts, &e, &startinpos, &endinpos, &exc, &s,
3879 &unicode, &outpos, &p))
3880 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003881 }
3882
Antoine Pitrou244651a2009-05-04 18:56:13 +00003883 /* end of string */
3884
3885 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3886 /* if we're in an inconsistent state, that's an error */
3887 if (surrogate ||
3888 (base64bits >= 6) ||
3889 (base64bits > 0 && base64buffer != 0)) {
3890 outpos = p-PyUnicode_AS_UNICODE(unicode);
3891 endinpos = size;
3892 if (unicode_decode_call_errorhandler(
3893 errors, &errorHandler,
3894 "utf7", "unterminated shift sequence",
3895 &starts, &e, &startinpos, &endinpos, &exc, &s,
3896 &unicode, &outpos, &p))
3897 goto onError;
3898 if (s < e)
3899 goto restart;
3900 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003901 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003902
3903 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003904 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003905 if (inShift) {
3906 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003907 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003908 }
3909 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003910 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003911 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003912 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003913
Victor Stinnerfe226c02011-10-03 03:52:20 +02003914 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003915 goto onError;
3916
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003917 Py_XDECREF(errorHandler);
3918 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02003919#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02003920 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003921 Py_DECREF(unicode);
3922 return NULL;
3923 }
Victor Stinner17efeed2011-10-04 20:05:46 +02003924#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02003925 assert(_PyUnicode_CheckConsistency(unicode, 1));
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003926 return (PyObject *)unicode;
3927
Benjamin Peterson29060642009-01-31 22:14:21 +00003928 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003929 Py_XDECREF(errorHandler);
3930 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003931 Py_DECREF(unicode);
3932 return NULL;
3933}
3934
3935
Alexander Belopolsky40018472011-02-26 01:02:56 +00003936PyObject *
3937PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003938 Py_ssize_t size,
3939 int base64SetO,
3940 int base64WhiteSpace,
3941 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003942{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003943 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003944 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003945 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003946 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003947 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003948 unsigned int base64bits = 0;
3949 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003950 char * out;
3951 char * start;
3952
3953 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003954 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003955
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003956 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003957 return PyErr_NoMemory();
3958
Antoine Pitrou244651a2009-05-04 18:56:13 +00003959 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003960 if (v == NULL)
3961 return NULL;
3962
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003963 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003964 for (;i < size; ++i) {
3965 Py_UNICODE ch = s[i];
3966
Antoine Pitrou244651a2009-05-04 18:56:13 +00003967 if (inShift) {
3968 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3969 /* shifting out */
3970 if (base64bits) { /* output remaining bits */
3971 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3972 base64buffer = 0;
3973 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003974 }
3975 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003976 /* Characters not in the BASE64 set implicitly unshift the sequence
3977 so no '-' is required, except if the character is itself a '-' */
3978 if (IS_BASE64(ch) || ch == '-') {
3979 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003980 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003981 *out++ = (char) ch;
3982 }
3983 else {
3984 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003985 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003986 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003987 else { /* not in a shift sequence */
3988 if (ch == '+') {
3989 *out++ = '+';
3990 *out++ = '-';
3991 }
3992 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3993 *out++ = (char) ch;
3994 }
3995 else {
3996 *out++ = '+';
3997 inShift = 1;
3998 goto encode_char;
3999 }
4000 }
4001 continue;
4002encode_char:
4003#ifdef Py_UNICODE_WIDE
4004 if (ch >= 0x10000) {
4005 /* code first surrogate */
4006 base64bits += 16;
4007 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4008 while (base64bits >= 6) {
4009 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4010 base64bits -= 6;
4011 }
4012 /* prepare second surrogate */
4013 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
4014 }
4015#endif
4016 base64bits += 16;
4017 base64buffer = (base64buffer << 16) | ch;
4018 while (base64bits >= 6) {
4019 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4020 base64bits -= 6;
4021 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004022 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004023 if (base64bits)
4024 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4025 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004026 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004027 if (_PyBytes_Resize(&v, out - start) < 0)
4028 return NULL;
4029 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004030}
4031
Antoine Pitrou244651a2009-05-04 18:56:13 +00004032#undef IS_BASE64
4033#undef FROM_BASE64
4034#undef TO_BASE64
4035#undef DECODE_DIRECT
4036#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004037
Guido van Rossumd57fd912000-03-10 22:53:23 +00004038/* --- UTF-8 Codec -------------------------------------------------------- */
4039
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details.
       The table is indexed by the (unsigned) value of the first byte of
       a sequence and is only ever read, hence it is declared const. */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4061
Alexander Belopolsky40018472011-02-26 01:02:56 +00004062PyObject *
4063PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004064 Py_ssize_t size,
4065 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004066{
Walter Dörwald69652032004-09-07 20:24:22 +00004067 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4068}
4069
Antoine Pitrouab868312009-01-10 15:40:25 +00004070/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4071#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4072
4073/* Mask to quickly check whether a C 'long' contains a
4074 non-ASCII, UTF8-encoded char. */
4075#if (SIZEOF_LONG == 8)
4076# define ASCII_CHAR_MASK 0x8080808080808080L
4077#elif (SIZEOF_LONG == 4)
4078# define ASCII_CHAR_MASK 0x80808080L
4079#else
4080# error C 'long' size should be either 4 or 8!
4081#endif
4082
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004083/* Scans a UTF-8 string and returns the maximum character to be expected,
4084 the size of the decoded unicode string and if any major errors were
4085 encountered.
4086
4087 This function does check basic UTF-8 sanity, it does however NOT CHECK
4088 if the string contains surrogates, and if all continuation bytes are
4089 within the correct ranges, these checks are performed in
4090 PyUnicode_DecodeUTF8Stateful.
4091
4092 If it sets has_errors to 1, it means the value of unicode_size and max_char
4093 will be bogus and you should not rely on useful information in them.
4094 */
4095static Py_UCS4
4096utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
4097 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
4098 int *has_errors)
4099{
4100 Py_ssize_t n;
4101 Py_ssize_t char_count = 0;
4102 Py_UCS4 max_char = 127, new_max;
4103 Py_UCS4 upper_bound;
4104 const unsigned char *p = (const unsigned char *)s;
4105 const unsigned char *end = p + string_size;
4106 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
4107 int err = 0;
4108
4109 for (; p < end && !err; ++p, ++char_count) {
4110 /* Only check value if it's not a ASCII char... */
4111 if (*p < 0x80) {
4112 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4113 an explanation. */
4114 if (!((size_t) p & LONG_PTR_MASK)) {
4115 /* Help register allocation */
4116 register const unsigned char *_p = p;
4117 while (_p < aligned_end) {
4118 unsigned long value = *(unsigned long *) _p;
4119 if (value & ASCII_CHAR_MASK)
4120 break;
4121 _p += SIZEOF_LONG;
4122 char_count += SIZEOF_LONG;
4123 }
4124 p = _p;
4125 if (p == end)
4126 break;
4127 }
4128 }
4129 if (*p >= 0x80) {
4130 n = utf8_code_length[*p];
4131 new_max = max_char;
4132 switch (n) {
4133 /* invalid start byte */
4134 case 0:
4135 err = 1;
4136 break;
4137 case 2:
4138 /* Code points between 0x00FF and 0x07FF inclusive.
4139 Approximate the upper bound of the code point,
4140 if this flips over 255 we can be sure it will be more
4141 than 255 and the string will need 2 bytes per code coint,
4142 if it stays under or equal to 255, we can be sure 1 byte
4143 is enough.
4144 ((*p & 0b00011111) << 6) | 0b00111111 */
4145 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
4146 if (max_char < upper_bound)
4147 new_max = upper_bound;
4148 /* Ensure we track at least that we left ASCII space. */
4149 if (new_max < 128)
4150 new_max = 128;
4151 break;
4152 case 3:
4153 /* Between 0x0FFF and 0xFFFF inclusive, so values are
4154 always > 255 and <= 65535 and will always need 2 bytes. */
4155 if (max_char < 65535)
4156 new_max = 65535;
4157 break;
4158 case 4:
4159 /* Code point will be above 0xFFFF for sure in this case. */
4160 new_max = 65537;
4161 break;
4162 /* Internal error, this should be caught by the first if */
4163 case 1:
4164 default:
4165 assert(0 && "Impossible case in utf8_max_char_and_size");
4166 err = 1;
4167 }
4168 /* Instead of number of overall bytes for this code point,
Georg Brandl7597add2011-10-05 16:36:47 +02004169 n contains the number of following bytes: */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004170 --n;
4171 /* Check if the follow up chars are all valid continuation bytes */
4172 if (n >= 1) {
4173 const unsigned char *cont;
4174 if ((p + n) >= end) {
4175 if (consumed == 0)
4176 /* incomplete data, non-incremental decoding */
4177 err = 1;
4178 break;
4179 }
4180 for (cont = p + 1; cont < (p + n); ++cont) {
4181 if ((*cont & 0xc0) != 0x80) {
4182 err = 1;
4183 break;
4184 }
4185 }
4186 p += n;
4187 }
4188 else
4189 err = 1;
4190 max_char = new_max;
4191 }
4192 }
4193
4194 if (unicode_size)
4195 *unicode_size = char_count;
4196 if (has_errors)
4197 *has_errors = err;
4198 return max_char;
4199}
4200
/* Store value at position index of the character buffer buf, narrowing
   it to the element type selected by kind.  Works like PyUnicode_WRITE
   but additionally accepts PyUnicode_WCHAR_KIND, in which case buf is
   treated as the Py_UNICODE (wstr) buffer of the legacy representation.
   Any kind other than WCHAR, 1BYTE or 2BYTE is handled as Py_UCS4.
   Exactly one branch runs, so index and value are evaluated once. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                     \
    do {                                                                    \
        const int write_kind_ = (kind);                                     \
        if (write_kind_ == PyUnicode_WCHAR_KIND) {                          \
            *((Py_UNICODE *)(buf) + (index)) = (Py_UNICODE)(value);         \
        }                                                                   \
        else if (write_kind_ == PyUnicode_1BYTE_KIND) {                     \
            *((unsigned char *)(buf) + (index)) = (unsigned char)(value);   \
        }                                                                   \
        else if (write_kind_ == PyUnicode_2BYTE_KIND) {                     \
            *((Py_UCS2 *)(buf) + (index)) = (Py_UCS2)(value);               \
        }                                                                   \
        else {                                                              \
            *((Py_UCS4 *)(buf) + (index)) = (Py_UCS4)(value);               \
        }                                                                   \
    } while (0)
4215
Alexander Belopolsky40018472011-02-26 01:02:56 +00004216PyObject *
4217PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004218 Py_ssize_t size,
4219 const char *errors,
4220 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004221{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004222 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004223 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004224 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004225 Py_ssize_t startinpos;
4226 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00004227 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004228 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004229 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004230 PyObject *errorHandler = NULL;
4231 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004232 Py_UCS4 maxchar = 0;
4233 Py_ssize_t unicode_size;
4234 Py_ssize_t i;
4235 int kind;
4236 void *data;
4237 int has_errors;
4238 Py_UNICODE *error_outptr;
4239#if SIZEOF_WCHAR_T == 2
4240 Py_ssize_t wchar_offset = 0;
4241#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004242
Walter Dörwald69652032004-09-07 20:24:22 +00004243 if (size == 0) {
4244 if (consumed)
4245 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004246 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00004247 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004248 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
4249 consumed, &has_errors);
4250 if (has_errors) {
4251 unicode = _PyUnicode_New(size);
4252 if (!unicode)
4253 return NULL;
4254 kind = PyUnicode_WCHAR_KIND;
4255 data = PyUnicode_AS_UNICODE(unicode);
4256 assert(data != NULL);
4257 }
4258 else {
4259 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
4260 if (!unicode)
4261 return NULL;
4262 /* When the string is ASCII only, just use memcpy and return.
4263 unicode_size may be != size if there is an incomplete UTF-8
4264 sequence at the end of the ASCII block. */
4265 if (maxchar < 128 && size == unicode_size) {
4266 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4267 return (PyObject *)unicode;
4268 }
4269 kind = PyUnicode_KIND(unicode);
4270 data = PyUnicode_DATA(unicode);
4271 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004272 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004273 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004274 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00004275 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004276
4277 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004278 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004279
4280 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004281 /* Fast path for runs of ASCII characters. Given that common UTF-8
4282 input will consist of an overwhelming majority of ASCII
4283 characters, we try to optimize for this case by checking
4284 as many characters as a C 'long' can contain.
4285 First, check if we can do an aligned read, as most CPUs have
4286 a penalty for unaligned reads.
4287 */
4288 if (!((size_t) s & LONG_PTR_MASK)) {
4289 /* Help register allocation */
4290 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004291 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004292 while (_s < aligned_end) {
4293 /* Read a whole long at a time (either 4 or 8 bytes),
4294 and do a fast unrolled copy if it only contains ASCII
4295 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004296 unsigned long value = *(unsigned long *) _s;
4297 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004298 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004299 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4300 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4301 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4302 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004303#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004304 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4305 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4306 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4307 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004308#endif
4309 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004310 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004311 }
4312 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004313 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004314 if (s == e)
4315 break;
4316 ch = (unsigned char)*s;
4317 }
4318 }
4319
4320 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004321 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004322 s++;
4323 continue;
4324 }
4325
4326 n = utf8_code_length[ch];
4327
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004328 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004329 if (consumed)
4330 break;
4331 else {
4332 errmsg = "unexpected end of data";
4333 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004334 endinpos = startinpos+1;
4335 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4336 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004337 goto utf8Error;
4338 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004339 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004340
4341 switch (n) {
4342
4343 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004344 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004345 startinpos = s-starts;
4346 endinpos = startinpos+1;
4347 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004348
4349 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004350 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004351 startinpos = s-starts;
4352 endinpos = startinpos+1;
4353 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004354
4355 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004356 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004357 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004358 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004359 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004360 goto utf8Error;
4361 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004362 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004363 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004364 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004365 break;
4366
4367 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004368 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4369 will result in surrogates in range d800-dfff. Surrogates are
4370 not valid UTF-8 so they are rejected.
4371 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4372 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004373 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004374 (s[2] & 0xc0) != 0x80 ||
4375 ((unsigned char)s[0] == 0xE0 &&
4376 (unsigned char)s[1] < 0xA0) ||
4377 ((unsigned char)s[0] == 0xED &&
4378 (unsigned char)s[1] > 0x9F)) {
4379 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004380 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004381 endinpos = startinpos + 1;
4382
4383 /* if s[1] first two bits are 1 and 0, then the invalid
4384 continuation byte is s[2], so increment endinpos by 1,
4385 if not, s[1] is invalid and endinpos doesn't need to
4386 be incremented. */
4387 if ((s[1] & 0xC0) == 0x80)
4388 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004389 goto utf8Error;
4390 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004391 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004392 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004393 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004394 break;
4395
4396 case 4:
4397 if ((s[1] & 0xc0) != 0x80 ||
4398 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004399 (s[3] & 0xc0) != 0x80 ||
4400 ((unsigned char)s[0] == 0xF0 &&
4401 (unsigned char)s[1] < 0x90) ||
4402 ((unsigned char)s[0] == 0xF4 &&
4403 (unsigned char)s[1] > 0x8F)) {
4404 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004405 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004406 endinpos = startinpos + 1;
4407 if ((s[1] & 0xC0) == 0x80) {
4408 endinpos++;
4409 if ((s[2] & 0xC0) == 0x80)
4410 endinpos++;
4411 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004412 goto utf8Error;
4413 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004414 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004415 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4416 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4417
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004418 /* If the string is flexible or we have native UCS-4, write
4419 directly.. */
4420 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4421 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004422
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004423 else {
4424 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004425
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004426 /* translate from 10000..10FFFF to 0..FFFF */
4427 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004428
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004429 /* high surrogate = top 10 bits added to D800 */
4430 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4431 (Py_UNICODE)(0xD800 + (ch >> 10)));
4432
4433 /* low surrogate = bottom 10 bits added to DC00 */
4434 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4435 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4436 }
4437#if SIZEOF_WCHAR_T == 2
4438 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004439#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004440 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004441 }
4442 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004443 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004444
Benjamin Peterson29060642009-01-31 22:14:21 +00004445 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004446 /* If this is not yet a resizable string, make it one.. */
4447 if (kind != PyUnicode_WCHAR_KIND) {
4448 const Py_UNICODE *u;
4449 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4450 if (!new_unicode)
4451 goto onError;
4452 u = PyUnicode_AsUnicode((PyObject *)unicode);
4453 if (!u)
4454 goto onError;
4455#if SIZEOF_WCHAR_T == 2
4456 i += wchar_offset;
4457#endif
4458 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4459 Py_DECREF(unicode);
4460 unicode = new_unicode;
4461 kind = 0;
4462 data = PyUnicode_AS_UNICODE(new_unicode);
4463 assert(data != NULL);
4464 }
4465 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004466 if (unicode_decode_call_errorhandler(
4467 errors, &errorHandler,
4468 "utf8", errmsg,
4469 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004470 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004471 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004472 /* Update data because unicode_decode_call_errorhandler might have
4473 re-created or resized the unicode object. */
4474 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004475 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004476 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004477 /* Ensure the unicode_size calculation above was correct: */
4478 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4479
Walter Dörwald69652032004-09-07 20:24:22 +00004480 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004481 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004482
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004483 /* Adjust length and ready string when it contained errors and
4484 is of the old resizable kind. */
4485 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004486 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004487 goto onError;
4488 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004489
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004490 Py_XDECREF(errorHandler);
4491 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004492#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004493 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004494 Py_DECREF(unicode);
4495 return NULL;
4496 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004497#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004498 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004499 return (PyObject *)unicode;
4500
Benjamin Peterson29060642009-01-31 22:14:21 +00004501 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004502 Py_XDECREF(errorHandler);
4503 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004504 Py_DECREF(unicode);
4505 return NULL;
4506}
4507
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004508#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004509
#ifdef __APPLE__

/* Minimal UTF-8 decoder with built-in "surrogateescape" behaviour, used to
   decode the command line arguments on Mac OS X.

   Decodes the `size` bytes starting at `s` into a NUL-terminated wchar_t
   buffer obtained from PyMem_Malloc().  A byte that does not begin a
   well-formed sequence is emitted as the lone surrogate 0xDC00 + byte and
   decoding resumes with the following byte.

   Returns NULL when the buffer cannot be allocated; PyErr_NoMemory() is
   called only when the size computation itself does not fit. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *end;
    wchar_t *buffer, *out;

    /* The output never needs more units than there are input bytes, plus
       one unit for the terminator. */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    buffer = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (buffer == NULL)
        return NULL;

    out = buffer;
    end = s + size;
    while (s < end) {
        /* Unsigned view of the bytes at the current position. */
        const unsigned char *b = (const unsigned char *)s;
        const Py_UCS4 lead = b[0];
        Py_UCS4 ch;
        int seqlen;
        int consumed_sequence = 0;

        /* ASCII fast path */
        if (lead < 0x80) {
            *out++ = (wchar_t)lead;
            s++;
            continue;
        }

        seqlen = utf8_code_length[lead];
        if (!(s + seqlen > end)) {
            switch (seqlen) {
            case 0:
            case 1:
                /* invalid start byte: escape it below */
                break;

            case 2:
                if ((b[1] & 0xc0) != 0x80)
                    break;
                ch = ((b[0] & 0x1f) << 6) + (b[1] & 0x3f);
                assert ((ch > 0x007F) && (ch <= 0x07FF));
                *out++ = (wchar_t)ch;
                consumed_sequence = 1;
                break;

            case 3:
                /* Sequences in range \xed\xa0\x80-\xed\xbf\xbf would decode
                   to surrogates d800-dfff, which are not valid UTF-8, and
                   \xe0 followed by a byte below \xa0 is an overlong form;
                   both are rejected.
                   See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
                   (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
                if ((b[1] & 0xc0) != 0x80 || (b[2] & 0xc0) != 0x80)
                    break;
                if (b[0] == 0xE0 && b[1] < 0xA0)
                    break;
                if (b[0] == 0xED && b[1] > 0x9F)
                    break;
                ch = ((b[0] & 0x0f) << 12) + ((b[1] & 0x3f) << 6) + (b[2] & 0x3f);
                assert ((ch > 0x07FF) && (ch <= 0xFFFF));
                *out++ = (wchar_t)ch;
                consumed_sequence = 1;
                break;

            case 4:
                if ((b[1] & 0xc0) != 0x80 ||
                    (b[2] & 0xc0) != 0x80 ||
                    (b[3] & 0xc0) != 0x80)
                    break;
                /* \xf0 needs a second byte >= \x90 (no overlong forms) and
                   \xf4 a second byte <= \x8f (nothing above 0x10ffff). */
                if (b[0] == 0xF0 && b[1] < 0x90)
                    break;
                if (b[0] == 0xF4 && b[1] > 0x8F)
                    break;
                ch = ((b[0] & 0x7) << 18) + ((b[1] & 0x3f) << 12) +
                     ((b[2] & 0x3f) << 6) + (b[3] & 0x3f);
                assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#if SIZEOF_WCHAR_T == 4
                *out++ = (wchar_t)ch;
#else
                /* 16-bit wchar_t: store a surrogate pair instead. */

                /* translate from 10000..10FFFF to 0..FFFF */
                ch -= 0x10000;

                /* high surrogate = top 10 bits added to D800 */
                *out++ = (wchar_t)(0xD800 + (ch >> 10));

                /* low surrogate = bottom 10 bits added to DC00 */
                *out++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
#endif
                consumed_sequence = 1;
                break;

            default:
                /* Any other table value: nothing is written and the whole
                   length is skipped. */
                consumed_sequence = 1;
                break;
            }
        }

        if (consumed_sequence) {
            s += seqlen;
        }
        else {
            /* surrogateescape: map the offending byte to 0xDC00 + byte */
            *out++ = 0xDC00 + lead;
            s++;
        }
    }
    *out = L'\0';
    return buffer;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004624
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004625/* Primary internal function which creates utf8 encoded bytes objects.
4626
4627 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004628 and allocate exactly as much space needed at the end. Else allocate the
4629 maximum possible needed (4 result bytes per Unicode character), and return
4630 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004631*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004632PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004633_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004634{
Tim Peters602f7402002-04-27 18:03:26 +00004635#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004636
Guido van Rossum98297ee2007-11-06 21:34:58 +00004637 Py_ssize_t i; /* index into s of next input byte */
4638 PyObject *result; /* result string object */
4639 char *p; /* next free byte in output buffer */
4640 Py_ssize_t nallocated; /* number of result bytes allocated */
4641 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004642 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004643 PyObject *errorHandler = NULL;
4644 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004645 int kind;
4646 void *data;
4647 Py_ssize_t size;
4648 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4649#if SIZEOF_WCHAR_T == 2
4650 Py_ssize_t wchar_offset = 0;
4651#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004652
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004653 if (!PyUnicode_Check(unicode)) {
4654 PyErr_BadArgument();
4655 return NULL;
4656 }
4657
4658 if (PyUnicode_READY(unicode) == -1)
4659 return NULL;
4660
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004661 if (PyUnicode_UTF8(unicode))
4662 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4663 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004664
4665 kind = PyUnicode_KIND(unicode);
4666 data = PyUnicode_DATA(unicode);
4667 size = PyUnicode_GET_LENGTH(unicode);
4668
Tim Peters602f7402002-04-27 18:03:26 +00004669 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004670
Tim Peters602f7402002-04-27 18:03:26 +00004671 if (size <= MAX_SHORT_UNICHARS) {
4672 /* Write into the stack buffer; nallocated can't overflow.
4673 * At the end, we'll allocate exactly as much heap space as it
4674 * turns out we need.
4675 */
4676 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004677 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004678 p = stackbuf;
4679 }
4680 else {
4681 /* Overallocate on the heap, and give the excess back at the end. */
4682 nallocated = size * 4;
4683 if (nallocated / 4 != size) /* overflow! */
4684 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004685 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004686 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004687 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004688 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004689 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004690
Tim Peters602f7402002-04-27 18:03:26 +00004691 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004692 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004693
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004694 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004695 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004696 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004697
Guido van Rossumd57fd912000-03-10 22:53:23 +00004698 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004699 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004700 *p++ = (char)(0xc0 | (ch >> 6));
4701 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004702 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004703 Py_ssize_t newpos;
4704 PyObject *rep;
4705 Py_ssize_t repsize, k, startpos;
4706 startpos = i-1;
4707#if SIZEOF_WCHAR_T == 2
4708 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004709#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004710 rep = unicode_encode_call_errorhandler(
4711 errors, &errorHandler, "utf-8", "surrogates not allowed",
4712 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4713 &exc, startpos, startpos+1, &newpos);
4714 if (!rep)
4715 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004716
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004717 if (PyBytes_Check(rep))
4718 repsize = PyBytes_GET_SIZE(rep);
4719 else
4720 repsize = PyUnicode_GET_SIZE(rep);
4721
4722 if (repsize > 4) {
4723 Py_ssize_t offset;
4724
4725 if (result == NULL)
4726 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004727 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004728 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004729
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004730 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4731 /* integer overflow */
4732 PyErr_NoMemory();
4733 goto error;
4734 }
4735 nallocated += repsize - 4;
4736 if (result != NULL) {
4737 if (_PyBytes_Resize(&result, nallocated) < 0)
4738 goto error;
4739 } else {
4740 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004741 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004742 goto error;
4743 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4744 }
4745 p = PyBytes_AS_STRING(result) + offset;
4746 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004747
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004748 if (PyBytes_Check(rep)) {
4749 char *prep = PyBytes_AS_STRING(rep);
4750 for(k = repsize; k > 0; k--)
4751 *p++ = *prep++;
4752 } else /* rep is unicode */ {
4753 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4754 Py_UNICODE c;
4755
4756 for(k=0; k<repsize; k++) {
4757 c = prep[k];
4758 if (0x80 <= c) {
4759 raise_encode_exception(&exc, "utf-8",
4760 PyUnicode_AS_UNICODE(unicode),
4761 size, i-1, i,
4762 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004763 goto error;
4764 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004765 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004766 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004767 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004768 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004769 } else if (ch < 0x10000) {
4770 *p++ = (char)(0xe0 | (ch >> 12));
4771 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4772 *p++ = (char)(0x80 | (ch & 0x3f));
4773 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004774 /* Encode UCS4 Unicode ordinals */
4775 *p++ = (char)(0xf0 | (ch >> 18));
4776 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4777 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4778 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004779#if SIZEOF_WCHAR_T == 2
4780 wchar_offset++;
4781#endif
Tim Peters602f7402002-04-27 18:03:26 +00004782 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004783 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004784
Guido van Rossum98297ee2007-11-06 21:34:58 +00004785 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004786 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004787 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004788 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004789 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004790 }
4791 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004792 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004793 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004794 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004795 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004796 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004797
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004798 Py_XDECREF(errorHandler);
4799 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004800 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004801 error:
4802 Py_XDECREF(errorHandler);
4803 Py_XDECREF(exc);
4804 Py_XDECREF(result);
4805 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004806
Tim Peters602f7402002-04-27 18:03:26 +00004807#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004808}
4809
Alexander Belopolsky40018472011-02-26 01:02:56 +00004810PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004811PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4812 Py_ssize_t size,
4813 const char *errors)
4814{
4815 PyObject *v, *unicode;
4816
4817 unicode = PyUnicode_FromUnicode(s, size);
4818 if (unicode == NULL)
4819 return NULL;
4820 v = _PyUnicode_AsUTF8String(unicode, errors);
4821 Py_DECREF(unicode);
4822 return v;
4823}
4824
4825PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004826PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004827{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004828 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004829}
4830
Walter Dörwald41980ca2007-08-16 21:55:45 +00004831/* --- UTF-32 Codec ------------------------------------------------------- */
4832
4833PyObject *
4834PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004835 Py_ssize_t size,
4836 const char *errors,
4837 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004838{
4839 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4840}
4841
4842PyObject *
4843PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004844 Py_ssize_t size,
4845 const char *errors,
4846 int *byteorder,
4847 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004848{
4849 const char *starts = s;
4850 Py_ssize_t startinpos;
4851 Py_ssize_t endinpos;
4852 Py_ssize_t outpos;
4853 PyUnicodeObject *unicode;
4854 Py_UNICODE *p;
4855#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004856 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004857 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004858#else
4859 const int pairs = 0;
4860#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004861 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004862 int bo = 0; /* assume native ordering by default */
4863 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004864 /* Offsets from q for retrieving bytes in the right order. */
4865#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4866 int iorder[] = {0, 1, 2, 3};
4867#else
4868 int iorder[] = {3, 2, 1, 0};
4869#endif
4870 PyObject *errorHandler = NULL;
4871 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004872
Walter Dörwald41980ca2007-08-16 21:55:45 +00004873 q = (unsigned char *)s;
4874 e = q + size;
4875
4876 if (byteorder)
4877 bo = *byteorder;
4878
4879 /* Check for BOM marks (U+FEFF) in the input and adjust current
4880 byte order setting accordingly. In native mode, the leading BOM
4881 mark is skipped, in all other modes, it is copied to the output
4882 stream as-is (giving a ZWNBSP character). */
4883 if (bo == 0) {
4884 if (size >= 4) {
4885 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004886 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004887#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004888 if (bom == 0x0000FEFF) {
4889 q += 4;
4890 bo = -1;
4891 }
4892 else if (bom == 0xFFFE0000) {
4893 q += 4;
4894 bo = 1;
4895 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004896#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004897 if (bom == 0x0000FEFF) {
4898 q += 4;
4899 bo = 1;
4900 }
4901 else if (bom == 0xFFFE0000) {
4902 q += 4;
4903 bo = -1;
4904 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004905#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004906 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004907 }
4908
4909 if (bo == -1) {
4910 /* force LE */
4911 iorder[0] = 0;
4912 iorder[1] = 1;
4913 iorder[2] = 2;
4914 iorder[3] = 3;
4915 }
4916 else if (bo == 1) {
4917 /* force BE */
4918 iorder[0] = 3;
4919 iorder[1] = 2;
4920 iorder[2] = 1;
4921 iorder[3] = 0;
4922 }
4923
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004924 /* On narrow builds we split characters outside the BMP into two
4925 codepoints => count how much extra space we need. */
4926#ifndef Py_UNICODE_WIDE
4927 for (qq = q; qq < e; qq += 4)
4928 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4929 pairs++;
4930#endif
4931
4932 /* This might be one to much, because of a BOM */
4933 unicode = _PyUnicode_New((size+3)/4+pairs);
4934 if (!unicode)
4935 return NULL;
4936 if (size == 0)
4937 return (PyObject *)unicode;
4938
4939 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004940 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004941
Walter Dörwald41980ca2007-08-16 21:55:45 +00004942 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004943 Py_UCS4 ch;
4944 /* remaining bytes at the end? (size should be divisible by 4) */
4945 if (e-q<4) {
4946 if (consumed)
4947 break;
4948 errmsg = "truncated data";
4949 startinpos = ((const char *)q)-starts;
4950 endinpos = ((const char *)e)-starts;
4951 goto utf32Error;
4952 /* The remaining input chars are ignored if the callback
4953 chooses to skip the input */
4954 }
4955 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4956 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004957
Benjamin Peterson29060642009-01-31 22:14:21 +00004958 if (ch >= 0x110000)
4959 {
4960 errmsg = "codepoint not in range(0x110000)";
4961 startinpos = ((const char *)q)-starts;
4962 endinpos = startinpos+4;
4963 goto utf32Error;
4964 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004965#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004966 if (ch >= 0x10000)
4967 {
4968 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4969 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4970 }
4971 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004972#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004973 *p++ = ch;
4974 q += 4;
4975 continue;
4976 utf32Error:
4977 outpos = p-PyUnicode_AS_UNICODE(unicode);
4978 if (unicode_decode_call_errorhandler(
4979 errors, &errorHandler,
4980 "utf32", errmsg,
4981 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4982 &unicode, &outpos, &p))
4983 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004984 }
4985
4986 if (byteorder)
4987 *byteorder = bo;
4988
4989 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004990 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004991
4992 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02004993 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004994 goto onError;
4995
4996 Py_XDECREF(errorHandler);
4997 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004998#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004999 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005000 Py_DECREF(unicode);
5001 return NULL;
5002 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005003#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005004 assert(_PyUnicode_CheckConsistency(unicode, 1));
Walter Dörwald41980ca2007-08-16 21:55:45 +00005005 return (PyObject *)unicode;
5006
Benjamin Peterson29060642009-01-31 22:14:21 +00005007 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005008 Py_DECREF(unicode);
5009 Py_XDECREF(errorHandler);
5010 Py_XDECREF(exc);
5011 return NULL;
5012}
5013
5014PyObject *
5015PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005016 Py_ssize_t size,
5017 const char *errors,
5018 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005019{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005020 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005021 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005022 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005023#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005024 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005025#else
5026 const int pairs = 0;
5027#endif
5028 /* Offsets from p for storing byte pairs in the right order. */
5029#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5030 int iorder[] = {0, 1, 2, 3};
5031#else
5032 int iorder[] = {3, 2, 1, 0};
5033#endif
5034
Benjamin Peterson29060642009-01-31 22:14:21 +00005035#define STORECHAR(CH) \
5036 do { \
5037 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5038 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5039 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5040 p[iorder[0]] = (CH) & 0xff; \
5041 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005042 } while(0)
5043
5044 /* In narrow builds we can output surrogate pairs as one codepoint,
5045 so we need less space. */
5046#ifndef Py_UNICODE_WIDE
5047 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005048 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
5049 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
5050 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005051#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005052 nsize = (size - pairs + (byteorder == 0));
5053 bytesize = nsize * 4;
5054 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005055 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005056 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005057 if (v == NULL)
5058 return NULL;
5059
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005060 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005061 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005062 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005063 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005064 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005065
5066 if (byteorder == -1) {
5067 /* force LE */
5068 iorder[0] = 0;
5069 iorder[1] = 1;
5070 iorder[2] = 2;
5071 iorder[3] = 3;
5072 }
5073 else if (byteorder == 1) {
5074 /* force BE */
5075 iorder[0] = 3;
5076 iorder[1] = 2;
5077 iorder[2] = 1;
5078 iorder[3] = 0;
5079 }
5080
5081 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005082 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005083#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005084 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
5085 Py_UCS4 ch2 = *s;
5086 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
5087 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
5088 s++;
5089 size--;
5090 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005091 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005092#endif
5093 STORECHAR(ch);
5094 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005095
5096 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005097 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005098#undef STORECHAR
5099}
5100
Alexander Belopolsky40018472011-02-26 01:02:56 +00005101PyObject *
5102PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005103{
5104 if (!PyUnicode_Check(unicode)) {
5105 PyErr_BadArgument();
5106 return NULL;
5107 }
5108 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005109 PyUnicode_GET_SIZE(unicode),
5110 NULL,
5111 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005112}
5113
Guido van Rossumd57fd912000-03-10 22:53:23 +00005114/* --- UTF-16 Codec ------------------------------------------------------- */
5115
Tim Peters772747b2001-08-09 22:21:55 +00005116PyObject *
5117PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005118 Py_ssize_t size,
5119 const char *errors,
5120 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005121{
Walter Dörwald69652032004-09-07 20:24:22 +00005122 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5123}
5124
Antoine Pitrouab868312009-01-10 15:40:25 +00005125/* Two masks for fast checking of whether a C 'long' may contain
5126 UTF16-encoded surrogate characters. This is an efficient heuristic,
5127 assuming that non-surrogate characters with a code point >= 0x8000 are
5128 rare in most input.
5129 FAST_CHAR_MASK is used when the input is in native byte ordering,
5130 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005131*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005132#if (SIZEOF_LONG == 8)
5133# define FAST_CHAR_MASK 0x8000800080008000L
5134# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5135#elif (SIZEOF_LONG == 4)
5136# define FAST_CHAR_MASK 0x80008000L
5137# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5138#else
5139# error C 'long' size should be either 4 or 8!
5140#endif
5141
Walter Dörwald69652032004-09-07 20:24:22 +00005142PyObject *
5143PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005144 Py_ssize_t size,
5145 const char *errors,
5146 int *byteorder,
5147 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005148{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005149 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005150 Py_ssize_t startinpos;
5151 Py_ssize_t endinpos;
5152 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005153 PyUnicodeObject *unicode;
5154 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00005155 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005156 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005157 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005158 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005159 /* Offsets from q for retrieving byte pairs in the right order. */
5160#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5161 int ihi = 1, ilo = 0;
5162#else
5163 int ihi = 0, ilo = 1;
5164#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005165 PyObject *errorHandler = NULL;
5166 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005167
5168 /* Note: size will always be longer than the resulting Unicode
5169 character count */
5170 unicode = _PyUnicode_New(size);
5171 if (!unicode)
5172 return NULL;
5173 if (size == 0)
5174 return (PyObject *)unicode;
5175
5176 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005177 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00005178 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005179 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005180
5181 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005182 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005183
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005184 /* Check for BOM marks (U+FEFF) in the input and adjust current
5185 byte order setting accordingly. In native mode, the leading BOM
5186 mark is skipped, in all other modes, it is copied to the output
5187 stream as-is (giving a ZWNBSP character). */
5188 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005189 if (size >= 2) {
5190 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005191#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005192 if (bom == 0xFEFF) {
5193 q += 2;
5194 bo = -1;
5195 }
5196 else if (bom == 0xFFFE) {
5197 q += 2;
5198 bo = 1;
5199 }
Tim Petersced69f82003-09-16 20:30:58 +00005200#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005201 if (bom == 0xFEFF) {
5202 q += 2;
5203 bo = 1;
5204 }
5205 else if (bom == 0xFFFE) {
5206 q += 2;
5207 bo = -1;
5208 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005209#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005210 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005211 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005212
Tim Peters772747b2001-08-09 22:21:55 +00005213 if (bo == -1) {
5214 /* force LE */
5215 ihi = 1;
5216 ilo = 0;
5217 }
5218 else if (bo == 1) {
5219 /* force BE */
5220 ihi = 0;
5221 ilo = 1;
5222 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005223#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5224 native_ordering = ilo < ihi;
5225#else
5226 native_ordering = ilo > ihi;
5227#endif
Tim Peters772747b2001-08-09 22:21:55 +00005228
Antoine Pitrouab868312009-01-10 15:40:25 +00005229 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005230 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005231 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005232 /* First check for possible aligned read of a C 'long'. Unaligned
5233 reads are more expensive, better to defer to another iteration. */
5234 if (!((size_t) q & LONG_PTR_MASK)) {
5235 /* Fast path for runs of non-surrogate chars. */
5236 register const unsigned char *_q = q;
5237 Py_UNICODE *_p = p;
5238 if (native_ordering) {
5239 /* Native ordering is simple: as long as the input cannot
5240 possibly contain a surrogate char, do an unrolled copy
5241 of several 16-bit code points to the target object.
5242 The non-surrogate check is done on several input bytes
5243 at a time (as many as a C 'long' can contain). */
5244 while (_q < aligned_end) {
5245 unsigned long data = * (unsigned long *) _q;
5246 if (data & FAST_CHAR_MASK)
5247 break;
5248 _p[0] = ((unsigned short *) _q)[0];
5249 _p[1] = ((unsigned short *) _q)[1];
5250#if (SIZEOF_LONG == 8)
5251 _p[2] = ((unsigned short *) _q)[2];
5252 _p[3] = ((unsigned short *) _q)[3];
5253#endif
5254 _q += SIZEOF_LONG;
5255 _p += SIZEOF_LONG / 2;
5256 }
5257 }
5258 else {
5259 /* Byteswapped ordering is similar, but we must decompose
5260 the copy bytewise, and take care of zero'ing out the
5261 upper bytes if the target object is in 32-bit units
5262 (that is, in UCS-4 builds). */
5263 while (_q < aligned_end) {
5264 unsigned long data = * (unsigned long *) _q;
5265 if (data & SWAPPED_FAST_CHAR_MASK)
5266 break;
5267 /* Zero upper bytes in UCS-4 builds */
5268#if (Py_UNICODE_SIZE > 2)
5269 _p[0] = 0;
5270 _p[1] = 0;
5271#if (SIZEOF_LONG == 8)
5272 _p[2] = 0;
5273 _p[3] = 0;
5274#endif
5275#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005276 /* Issue #4916; UCS-4 builds on big endian machines must
5277 fill the two last bytes of each 4-byte unit. */
5278#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
5279# define OFF 2
5280#else
5281# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00005282#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005283 ((unsigned char *) _p)[OFF + 1] = _q[0];
5284 ((unsigned char *) _p)[OFF + 0] = _q[1];
5285 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
5286 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
5287#if (SIZEOF_LONG == 8)
5288 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
5289 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
5290 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
5291 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
5292#endif
5293#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00005294 _q += SIZEOF_LONG;
5295 _p += SIZEOF_LONG / 2;
5296 }
5297 }
5298 p = _p;
5299 q = _q;
5300 if (q >= e)
5301 break;
5302 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005303 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005304
Benjamin Peterson14339b62009-01-31 16:36:08 +00005305 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005306
5307 if (ch < 0xD800 || ch > 0xDFFF) {
5308 *p++ = ch;
5309 continue;
5310 }
5311
5312 /* UTF-16 code pair: */
5313 if (q > e) {
5314 errmsg = "unexpected end of data";
5315 startinpos = (((const char *)q) - 2) - starts;
5316 endinpos = ((const char *)e) + 1 - starts;
5317 goto utf16Error;
5318 }
5319 if (0xD800 <= ch && ch <= 0xDBFF) {
5320 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5321 q += 2;
5322 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00005323#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005324 *p++ = ch;
5325 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005326#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005327 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005328#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005329 continue;
5330 }
5331 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005332 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005333 startinpos = (((const char *)q)-4)-starts;
5334 endinpos = startinpos+2;
5335 goto utf16Error;
5336 }
5337
Benjamin Peterson14339b62009-01-31 16:36:08 +00005338 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005339 errmsg = "illegal encoding";
5340 startinpos = (((const char *)q)-2)-starts;
5341 endinpos = startinpos+2;
5342 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005343
Benjamin Peterson29060642009-01-31 22:14:21 +00005344 utf16Error:
5345 outpos = p - PyUnicode_AS_UNICODE(unicode);
5346 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005347 errors,
5348 &errorHandler,
5349 "utf16", errmsg,
5350 &starts,
5351 (const char **)&e,
5352 &startinpos,
5353 &endinpos,
5354 &exc,
5355 (const char **)&q,
5356 &unicode,
5357 &outpos,
5358 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00005359 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005360 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005361 /* remaining byte at the end? (size should be even) */
5362 if (e == q) {
5363 if (!consumed) {
5364 errmsg = "truncated data";
5365 startinpos = ((const char *)q) - starts;
5366 endinpos = ((const char *)e) + 1 - starts;
5367 outpos = p - PyUnicode_AS_UNICODE(unicode);
5368 if (unicode_decode_call_errorhandler(
5369 errors,
5370 &errorHandler,
5371 "utf16", errmsg,
5372 &starts,
5373 (const char **)&e,
5374 &startinpos,
5375 &endinpos,
5376 &exc,
5377 (const char **)&q,
5378 &unicode,
5379 &outpos,
5380 &p))
5381 goto onError;
5382 /* The remaining input chars are ignored if the callback
5383 chooses to skip the input */
5384 }
5385 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005386
5387 if (byteorder)
5388 *byteorder = bo;
5389
Walter Dörwald69652032004-09-07 20:24:22 +00005390 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005391 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005392
Guido van Rossumd57fd912000-03-10 22:53:23 +00005393 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005394 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005395 goto onError;
5396
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005397 Py_XDECREF(errorHandler);
5398 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005399#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005400 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005401 Py_DECREF(unicode);
5402 return NULL;
5403 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005404#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005405 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005406 return (PyObject *)unicode;
5407
Benjamin Peterson29060642009-01-31 22:14:21 +00005408 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005409 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005410 Py_XDECREF(errorHandler);
5411 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005412 return NULL;
5413}
5414
Antoine Pitrouab868312009-01-10 15:40:25 +00005415#undef FAST_CHAR_MASK
5416#undef SWAPPED_FAST_CHAR_MASK
5417
Tim Peters772747b2001-08-09 22:21:55 +00005418PyObject *
5419PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005420 Py_ssize_t size,
5421 const char *errors,
5422 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005424 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005425 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005426 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005427#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005428 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005429#else
5430 const int pairs = 0;
5431#endif
Tim Peters772747b2001-08-09 22:21:55 +00005432 /* Offsets from p for storing byte pairs in the right order. */
5433#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5434 int ihi = 1, ilo = 0;
5435#else
5436 int ihi = 0, ilo = 1;
5437#endif
5438
Benjamin Peterson29060642009-01-31 22:14:21 +00005439#define STORECHAR(CH) \
5440 do { \
5441 p[ihi] = ((CH) >> 8) & 0xff; \
5442 p[ilo] = (CH) & 0xff; \
5443 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005444 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005445
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005446#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005447 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005448 if (s[i] >= 0x10000)
5449 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005450#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005451 /* 2 * (size + pairs + (byteorder == 0)) */
5452 if (size > PY_SSIZE_T_MAX ||
5453 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005454 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005455 nsize = size + pairs + (byteorder == 0);
5456 bytesize = nsize * 2;
5457 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005458 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005459 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005460 if (v == NULL)
5461 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005462
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005463 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005464 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005465 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005466 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005467 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005468
5469 if (byteorder == -1) {
5470 /* force LE */
5471 ihi = 1;
5472 ilo = 0;
5473 }
5474 else if (byteorder == 1) {
5475 /* force BE */
5476 ihi = 0;
5477 ilo = 1;
5478 }
5479
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005480 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005481 Py_UNICODE ch = *s++;
5482 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005483#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005484 if (ch >= 0x10000) {
5485 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5486 ch = 0xD800 | ((ch-0x10000) >> 10);
5487 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005488#endif
Tim Peters772747b2001-08-09 22:21:55 +00005489 STORECHAR(ch);
5490 if (ch2)
5491 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005492 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005493
5494 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005495 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005496#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005497}
5498
Alexander Belopolsky40018472011-02-26 01:02:56 +00005499PyObject *
5500PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005501{
5502 if (!PyUnicode_Check(unicode)) {
5503 PyErr_BadArgument();
5504 return NULL;
5505 }
5506 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005507 PyUnicode_GET_SIZE(unicode),
5508 NULL,
5509 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005510}
5511
5512/* --- Unicode Escape Codec ----------------------------------------------- */
5513
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005514/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5515 if all the escapes in the string make it still a valid ASCII string.
5516 Returns -1 if any escapes were found which cause the string to
5517 pop out of ASCII range. Otherwise returns the length of the
5518 required buffer to hold the string.
5519 */
5520Py_ssize_t
5521length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5522{
5523 const unsigned char *p = (const unsigned char *)s;
5524 const unsigned char *end = p + size;
5525 Py_ssize_t length = 0;
5526
5527 if (size < 0)
5528 return -1;
5529
5530 for (; p < end; ++p) {
5531 if (*p > 127) {
5532 /* Non-ASCII */
5533 return -1;
5534 }
5535 else if (*p != '\\') {
5536 /* Normal character */
5537 ++length;
5538 }
5539 else {
5540 /* Backslash-escape, check next char */
5541 ++p;
5542 /* Escape sequence reaches till end of string or
5543 non-ASCII follow-up. */
5544 if (p >= end || *p > 127)
5545 return -1;
5546 switch (*p) {
5547 case '\n':
5548 /* backslash + \n result in zero characters */
5549 break;
5550 case '\\': case '\'': case '\"':
5551 case 'b': case 'f': case 't':
5552 case 'n': case 'r': case 'v': case 'a':
5553 ++length;
5554 break;
5555 case '0': case '1': case '2': case '3':
5556 case '4': case '5': case '6': case '7':
5557 case 'x': case 'u': case 'U': case 'N':
5558 /* these do not guarantee ASCII characters */
5559 return -1;
5560 default:
5561 /* count the backslash + the other character */
5562 length += 2;
5563 }
5564 }
5565 }
5566 return length;
5567}
5568
/* Similar to PyUnicode_WRITE but either write into wstr field
   or treat string as ASCII: when `kind` is PyUnicode_WCHAR_KIND the
   value is stored into a Py_UNICODE buffer, otherwise into a byte buffer.
   Only one of the two stores is executed, so `index` is evaluated once. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value)                    \
    do {                                                                \
        if ((kind) != PyUnicode_WCHAR_KIND)                             \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else                                                            \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);       \
    } while (0)

/* Store `value` into a Py_UNICODE buffer.  NOTE: this macro reads a
   variable named `kind` from the scope where it is expanded and asserts
   that it is PyUnicode_WCHAR_KIND.  The expansion is a single
   parenthesized expression so it cannot be split by operator precedence
   at the point of use. */
#define WRITE_WSTR(buf, index, value)                                   \
    (assert(kind == PyUnicode_WCHAR_KIND),                              \
     ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value))
5582
5583
Fredrik Lundh06d12682001-01-24 07:59:11 +00005584static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005585
Alexander Belopolsky40018472011-02-26 01:02:56 +00005586PyObject *
5587PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005588 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005589 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005590{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005591 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005592 Py_ssize_t startinpos;
5593 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005594 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005595 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005596 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005597 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005598 char* message;
5599 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005600 PyObject *errorHandler = NULL;
5601 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005602 Py_ssize_t ascii_length;
5603 Py_ssize_t i;
5604 int kind;
5605 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005606
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005607 ascii_length = length_of_escaped_ascii_string(s, size);
5608
5609 /* After length_of_escaped_ascii_string() there are two alternatives,
5610 either the string is pure ASCII with named escapes like \n, etc.
5611 and we determined it's exact size (common case)
5612 or it contains \x, \u, ... escape sequences. then we create a
5613 legacy wchar string and resize it at the end of this function. */
5614 if (ascii_length >= 0) {
5615 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5616 if (!v)
5617 goto onError;
5618 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5619 kind = PyUnicode_1BYTE_KIND;
5620 data = PyUnicode_DATA(v);
5621 }
5622 else {
5623 /* Escaped strings will always be longer than the resulting
5624 Unicode string, so we start with size here and then reduce the
5625 length after conversion to the true value.
5626 (but if the error callback returns a long replacement string
5627 we'll have to allocate more space) */
5628 v = _PyUnicode_New(size);
5629 if (!v)
5630 goto onError;
5631 kind = PyUnicode_WCHAR_KIND;
5632 data = PyUnicode_AS_UNICODE(v);
5633 }
5634
Guido van Rossumd57fd912000-03-10 22:53:23 +00005635 if (size == 0)
5636 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005637 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005638 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005639
Guido van Rossumd57fd912000-03-10 22:53:23 +00005640 while (s < end) {
5641 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005642 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005643 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005644
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005645 if (kind == PyUnicode_WCHAR_KIND) {
5646 assert(i < _PyUnicode_WSTR_LENGTH(v));
5647 }
5648 else {
5649 /* The only case in which i == ascii_length is a backslash
5650 followed by a newline. */
5651 assert(i <= ascii_length);
5652 }
5653
Guido van Rossumd57fd912000-03-10 22:53:23 +00005654 /* Non-escape characters are interpreted as Unicode ordinals */
5655 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005656 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005657 continue;
5658 }
5659
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005660 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005661 /* \ - Escapes */
5662 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005663 c = *s++;
5664 if (s > end)
5665 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005666
5667 if (kind == PyUnicode_WCHAR_KIND) {
5668 assert(i < _PyUnicode_WSTR_LENGTH(v));
5669 }
5670 else {
5671 /* The only case in which i == ascii_length is a backslash
5672 followed by a newline. */
5673 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5674 }
5675
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005676 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677
Benjamin Peterson29060642009-01-31 22:14:21 +00005678 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005679 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005680 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5681 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5682 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5683 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5684 /* FF */
5685 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5686 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5687 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5688 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5689 /* VT */
5690 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5691 /* BEL, not classic C */
5692 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693
Benjamin Peterson29060642009-01-31 22:14:21 +00005694 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695 case '0': case '1': case '2': case '3':
5696 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005697 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005698 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005699 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005700 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005701 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005702 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005703 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005704 break;
5705
Benjamin Peterson29060642009-01-31 22:14:21 +00005706 /* hex escapes */
5707 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005708 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005709 digits = 2;
5710 message = "truncated \\xXX escape";
5711 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712
Benjamin Peterson29060642009-01-31 22:14:21 +00005713 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005714 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005715 digits = 4;
5716 message = "truncated \\uXXXX escape";
5717 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005718
Benjamin Peterson29060642009-01-31 22:14:21 +00005719 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005720 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005721 digits = 8;
5722 message = "truncated \\UXXXXXXXX escape";
5723 hexescape:
5724 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005725 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005726 if (s+digits>end) {
5727 endinpos = size;
5728 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005729 errors, &errorHandler,
5730 "unicodeescape", "end of string in escape sequence",
5731 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005732 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005733 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005734 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005735 goto nextByte;
5736 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005737 for (j = 0; j < digits; ++j) {
5738 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005739 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005740 endinpos = (s+j+1)-starts;
5741 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005742 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005743 errors, &errorHandler,
5744 "unicodeescape", message,
5745 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005746 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005747 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005748 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005749 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005750 }
5751 chr = (chr<<4) & ~0xF;
5752 if (c >= '0' && c <= '9')
5753 chr += c - '0';
5754 else if (c >= 'a' && c <= 'f')
5755 chr += 10 + c - 'a';
5756 else
5757 chr += 10 + c - 'A';
5758 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005759 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005760 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005761 /* _decoding_error will have already written into the
5762 target buffer. */
5763 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005764 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005765 /* when we get here, chr is a 32-bit unicode character */
5766 if (chr <= 0xffff)
5767 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005768 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005769 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005770 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005771 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005772#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005773 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005774#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005775 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005776 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5777 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005778#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005779 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005780 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005781 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005782 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005783 errors, &errorHandler,
5784 "unicodeescape", "illegal Unicode character",
5785 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005786 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005787 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005788 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005789 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005790 break;
5791
Benjamin Peterson29060642009-01-31 22:14:21 +00005792 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005793 case 'N':
5794 message = "malformed \\N character escape";
5795 if (ucnhash_CAPI == NULL) {
5796 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005797 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5798 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005799 if (ucnhash_CAPI == NULL)
5800 goto ucnhashError;
5801 }
5802 if (*s == '{') {
5803 const char *start = s+1;
5804 /* look for the closing brace */
5805 while (*s != '}' && s < end)
5806 s++;
5807 if (s > start && s < end && *s == '}') {
5808 /* found a name. look it up in the unicode database */
5809 message = "unknown Unicode character name";
5810 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005811 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5812 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005813 goto store;
5814 }
5815 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005816 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005817 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005818 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005819 errors, &errorHandler,
5820 "unicodeescape", message,
5821 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005822 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005823 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005824 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005825 break;
5826
5827 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005828 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005829 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005830 message = "\\ at end of string";
5831 s--;
5832 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005833 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005834 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005835 errors, &errorHandler,
5836 "unicodeescape", message,
5837 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005838 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005839 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005840 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005841 }
5842 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005843 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5844 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005845 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005846 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005847 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005848 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005849 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005850 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005851 /* Ensure the length prediction worked in case of ASCII strings */
5852 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5853
Victor Stinnerfe226c02011-10-03 03:52:20 +02005854 if (kind == PyUnicode_WCHAR_KIND)
5855 {
5856 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5857 goto onError;
Victor Stinnerfe226c02011-10-03 03:52:20 +02005858 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005859 Py_XDECREF(errorHandler);
5860 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005861#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005862 if (_PyUnicode_READY_REPLACE(&v)) {
5863 Py_DECREF(v);
5864 return NULL;
5865 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005866#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005867 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005869
Benjamin Peterson29060642009-01-31 22:14:21 +00005870 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005871 PyErr_SetString(
5872 PyExc_UnicodeError,
5873 "\\N escapes not supported (can't load unicodedata module)"
5874 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005875 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005876 Py_XDECREF(errorHandler);
5877 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005878 return NULL;
5879
Benjamin Peterson29060642009-01-31 22:14:21 +00005880 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005882 Py_XDECREF(errorHandler);
5883 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884 return NULL;
5885}
5886
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005887#undef WRITE_ASCII_OR_WSTR
5888#undef WRITE_WSTR
5889
/* Return a Unicode-Escape string version of the Unicode object.

   NOTE: this comment historically described a `quotes` flag (enclosing
   the string in u"" or u'' quotes as appropriate); the encoder declared
   below, PyUnicode_EncodeUnicodeEscape(s, size), has no such parameter.

*/
5896
Walter Dörwald79e913e2007-05-12 11:08:06 +00005897static const char *hexdigits = "0123456789abcdef";
5898
Alexander Belopolsky40018472011-02-26 01:02:56 +00005899PyObject *
5900PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005901 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005903 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005906#ifdef Py_UNICODE_WIDE
5907 const Py_ssize_t expandsize = 10;
5908#else
5909 const Py_ssize_t expandsize = 6;
5910#endif
5911
Thomas Wouters89f507f2006-12-13 04:49:30 +00005912 /* XXX(nnorwitz): rather than over-allocating, it would be
5913 better to choose a different scheme. Perhaps scan the
5914 first N-chars of the string and allocate based on that size.
5915 */
5916 /* Initial allocation is based on the longest-possible unichr
5917 escape.
5918
5919 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5920 unichr, so in this case it's the longest unichr escape. In
5921 narrow (UTF-16) builds this is five chars per source unichr
5922 since there are two unichrs in the surrogate pair, so in narrow
5923 (UTF-16) builds it's not the longest unichr escape.
5924
5925 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5926 so in the narrow (UTF-16) build case it's the longest unichr
5927 escape.
5928 */
5929
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005930 if (size == 0)
5931 return PyBytes_FromStringAndSize(NULL, 0);
5932
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005933 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005934 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005935
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005936 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005937 2
5938 + expandsize*size
5939 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940 if (repr == NULL)
5941 return NULL;
5942
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005943 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945 while (size-- > 0) {
5946 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005947
Walter Dörwald79e913e2007-05-12 11:08:06 +00005948 /* Escape backslashes */
5949 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950 *p++ = '\\';
5951 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005952 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005953 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005954
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005955#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005956 /* Map 21-bit characters to '\U00xxxxxx' */
5957 else if (ch >= 0x10000) {
5958 *p++ = '\\';
5959 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005960 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5961 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5962 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5963 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5964 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5965 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5966 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5967 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005968 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005969 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005970#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005971 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5972 else if (ch >= 0xD800 && ch < 0xDC00) {
5973 Py_UNICODE ch2;
5974 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005975
Benjamin Peterson29060642009-01-31 22:14:21 +00005976 ch2 = *s++;
5977 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005978 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005979 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5980 *p++ = '\\';
5981 *p++ = 'U';
5982 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5983 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5984 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5985 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5986 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5987 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5988 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5989 *p++ = hexdigits[ucs & 0x0000000F];
5990 continue;
5991 }
5992 /* Fall through: isolated surrogates are copied as-is */
5993 s--;
5994 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005995 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005996#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005997
Guido van Rossumd57fd912000-03-10 22:53:23 +00005998 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005999 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000 *p++ = '\\';
6001 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00006002 *p++ = hexdigits[(ch >> 12) & 0x000F];
6003 *p++ = hexdigits[(ch >> 8) & 0x000F];
6004 *p++ = hexdigits[(ch >> 4) & 0x000F];
6005 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006006 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006007
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006008 /* Map special whitespace to '\t', \n', '\r' */
6009 else if (ch == '\t') {
6010 *p++ = '\\';
6011 *p++ = 't';
6012 }
6013 else if (ch == '\n') {
6014 *p++ = '\\';
6015 *p++ = 'n';
6016 }
6017 else if (ch == '\r') {
6018 *p++ = '\\';
6019 *p++ = 'r';
6020 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006021
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006022 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006023 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006025 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00006026 *p++ = hexdigits[(ch >> 4) & 0x000F];
6027 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006028 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006029
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030 /* Copy everything else as-is */
6031 else
6032 *p++ = (char) ch;
6033 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006034
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006035 assert(p - PyBytes_AS_STRING(repr) > 0);
6036 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6037 return NULL;
6038 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006039}
6040
Alexander Belopolsky40018472011-02-26 01:02:56 +00006041PyObject *
6042PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006043{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006044 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006045 if (!PyUnicode_Check(unicode)) {
6046 PyErr_BadArgument();
6047 return NULL;
6048 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00006049 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6050 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006051 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006052}
6053
6054/* --- Raw Unicode Escape Codec ------------------------------------------- */
6055
Alexander Belopolsky40018472011-02-26 01:02:56 +00006056PyObject *
6057PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006058 Py_ssize_t size,
6059 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006061 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006062 Py_ssize_t startinpos;
6063 Py_ssize_t endinpos;
6064 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006065 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006066 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006067 const char *end;
6068 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006069 PyObject *errorHandler = NULL;
6070 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006071
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072 /* Escaped strings will always be longer than the resulting
6073 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006074 length after conversion to the true value. (But decoding error
6075 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076 v = _PyUnicode_New(size);
6077 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006078 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006079 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006080 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006081 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006082 end = s + size;
6083 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006084 unsigned char c;
6085 Py_UCS4 x;
6086 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006087 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006088
Benjamin Peterson29060642009-01-31 22:14:21 +00006089 /* Non-escape characters are interpreted as Unicode ordinals */
6090 if (*s != '\\') {
6091 *p++ = (unsigned char)*s++;
6092 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006093 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006094 startinpos = s-starts;
6095
6096 /* \u-escapes are only interpreted iff the number of leading
6097 backslashes if odd */
6098 bs = s;
6099 for (;s < end;) {
6100 if (*s != '\\')
6101 break;
6102 *p++ = (unsigned char)*s++;
6103 }
6104 if (((s - bs) & 1) == 0 ||
6105 s >= end ||
6106 (*s != 'u' && *s != 'U')) {
6107 continue;
6108 }
6109 p--;
6110 count = *s=='u' ? 4 : 8;
6111 s++;
6112
6113 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
6114 outpos = p-PyUnicode_AS_UNICODE(v);
6115 for (x = 0, i = 0; i < count; ++i, ++s) {
6116 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006117 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006118 endinpos = s-starts;
6119 if (unicode_decode_call_errorhandler(
6120 errors, &errorHandler,
6121 "rawunicodeescape", "truncated \\uXXXX",
6122 &starts, &end, &startinpos, &endinpos, &exc, &s,
6123 &v, &outpos, &p))
6124 goto onError;
6125 goto nextByte;
6126 }
6127 x = (x<<4) & ~0xF;
6128 if (c >= '0' && c <= '9')
6129 x += c - '0';
6130 else if (c >= 'a' && c <= 'f')
6131 x += 10 + c - 'a';
6132 else
6133 x += 10 + c - 'A';
6134 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00006135 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00006136 /* UCS-2 character */
6137 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006138 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006139 /* UCS-4 character. Either store directly, or as
6140 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00006141#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006142 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006143#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006144 x -= 0x10000L;
6145 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
6146 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00006147#endif
6148 } else {
6149 endinpos = s-starts;
6150 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006151 if (unicode_decode_call_errorhandler(
6152 errors, &errorHandler,
6153 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006154 &starts, &end, &startinpos, &endinpos, &exc, &s,
6155 &v, &outpos, &p))
6156 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006157 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006158 nextByte:
6159 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02006161 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006162 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006163 Py_XDECREF(errorHandler);
6164 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006165#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006166 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006167 Py_DECREF(v);
6168 return NULL;
6169 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006170#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006171 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006172 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006173
Benjamin Peterson29060642009-01-31 22:14:21 +00006174 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006175 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006176 Py_XDECREF(errorHandler);
6177 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006178 return NULL;
6179}
6180
Alexander Belopolsky40018472011-02-26 01:02:56 +00006181PyObject *
6182PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006183 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006185 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006186 char *p;
6187 char *q;
6188
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006189#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006190 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006191#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006192 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006193#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00006194
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006195 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006196 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006197
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006198 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006199 if (repr == NULL)
6200 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00006201 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006202 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006203
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006204 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205 while (size-- > 0) {
6206 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006207#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006208 /* Map 32-bit characters to '\Uxxxxxxxx' */
6209 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006210 *p++ = '\\';
6211 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00006212 *p++ = hexdigits[(ch >> 28) & 0xf];
6213 *p++ = hexdigits[(ch >> 24) & 0xf];
6214 *p++ = hexdigits[(ch >> 20) & 0xf];
6215 *p++ = hexdigits[(ch >> 16) & 0xf];
6216 *p++ = hexdigits[(ch >> 12) & 0xf];
6217 *p++ = hexdigits[(ch >> 8) & 0xf];
6218 *p++ = hexdigits[(ch >> 4) & 0xf];
6219 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006220 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006221 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00006222#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006223 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6224 if (ch >= 0xD800 && ch < 0xDC00) {
6225 Py_UNICODE ch2;
6226 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006227
Benjamin Peterson29060642009-01-31 22:14:21 +00006228 ch2 = *s++;
6229 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00006230 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006231 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6232 *p++ = '\\';
6233 *p++ = 'U';
6234 *p++ = hexdigits[(ucs >> 28) & 0xf];
6235 *p++ = hexdigits[(ucs >> 24) & 0xf];
6236 *p++ = hexdigits[(ucs >> 20) & 0xf];
6237 *p++ = hexdigits[(ucs >> 16) & 0xf];
6238 *p++ = hexdigits[(ucs >> 12) & 0xf];
6239 *p++ = hexdigits[(ucs >> 8) & 0xf];
6240 *p++ = hexdigits[(ucs >> 4) & 0xf];
6241 *p++ = hexdigits[ucs & 0xf];
6242 continue;
6243 }
6244 /* Fall through: isolated surrogates are copied as-is */
6245 s--;
6246 size++;
6247 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006248#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006249 /* Map 16-bit characters to '\uxxxx' */
6250 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006251 *p++ = '\\';
6252 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00006253 *p++ = hexdigits[(ch >> 12) & 0xf];
6254 *p++ = hexdigits[(ch >> 8) & 0xf];
6255 *p++ = hexdigits[(ch >> 4) & 0xf];
6256 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006257 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006258 /* Copy everything else as-is */
6259 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006260 *p++ = (char) ch;
6261 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006262 size = p - q;
6263
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006264 assert(size > 0);
6265 if (_PyBytes_Resize(&repr, size) < 0)
6266 return NULL;
6267 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006268}
6269
Alexander Belopolsky40018472011-02-26 01:02:56 +00006270PyObject *
6271PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006272{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006273 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006274 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00006275 PyErr_BadArgument();
6276 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006277 }
Walter Dörwald711005d2007-05-12 12:03:26 +00006278 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6279 PyUnicode_GET_SIZE(unicode));
6280
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006281 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006282}
6283
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006284/* --- Unicode Internal Codec ------------------------------------------- */
6285
Alexander Belopolsky40018472011-02-26 01:02:56 +00006286PyObject *
6287_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006288 Py_ssize_t size,
6289 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006290{
6291 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006292 Py_ssize_t startinpos;
6293 Py_ssize_t endinpos;
6294 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006295 PyUnicodeObject *v;
6296 Py_UNICODE *p;
6297 const char *end;
6298 const char *reason;
6299 PyObject *errorHandler = NULL;
6300 PyObject *exc = NULL;
6301
Neal Norwitzd43069c2006-01-08 01:12:10 +00006302#ifdef Py_UNICODE_WIDE
6303 Py_UNICODE unimax = PyUnicode_GetMax();
6304#endif
6305
Thomas Wouters89f507f2006-12-13 04:49:30 +00006306 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006307 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
6308 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006309 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006310 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
6311 as string was created with the old API. */
6312 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006313 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006314 p = PyUnicode_AS_UNICODE(v);
6315 end = s + size;
6316
6317 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006318 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006319 /* We have to sanity check the raw data, otherwise doom looms for
6320 some malformed UCS-4 data. */
6321 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006322#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006323 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006324#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006325 end-s < Py_UNICODE_SIZE
6326 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006327 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006328 startinpos = s - starts;
6329 if (end-s < Py_UNICODE_SIZE) {
6330 endinpos = end-starts;
6331 reason = "truncated input";
6332 }
6333 else {
6334 endinpos = s - starts + Py_UNICODE_SIZE;
6335 reason = "illegal code point (> 0x10FFFF)";
6336 }
6337 outpos = p - PyUnicode_AS_UNICODE(v);
6338 if (unicode_decode_call_errorhandler(
6339 errors, &errorHandler,
6340 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006341 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00006342 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006343 goto onError;
6344 }
6345 }
6346 else {
6347 p++;
6348 s += Py_UNICODE_SIZE;
6349 }
6350 }
6351
Victor Stinnerfe226c02011-10-03 03:52:20 +02006352 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006353 goto onError;
6354 Py_XDECREF(errorHandler);
6355 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006356#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006357 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006358 Py_DECREF(v);
6359 return NULL;
6360 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006361#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006362 assert(_PyUnicode_CheckConsistency(v, 1));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006363 return (PyObject *)v;
6364
Benjamin Peterson29060642009-01-31 22:14:21 +00006365 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006366 Py_XDECREF(v);
6367 Py_XDECREF(errorHandler);
6368 Py_XDECREF(exc);
6369 return NULL;
6370}
6371
Guido van Rossumd57fd912000-03-10 22:53:23 +00006372/* --- Latin-1 Codec ------------------------------------------------------ */
6373
Alexander Belopolsky40018472011-02-26 01:02:56 +00006374PyObject *
6375PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006376 Py_ssize_t size,
6377 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006378{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006379 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006380 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006381}
6382
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006383/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006384static void
6385make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006386 const char *encoding,
6387 const Py_UNICODE *unicode, Py_ssize_t size,
6388 Py_ssize_t startpos, Py_ssize_t endpos,
6389 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006390{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006391 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006392 *exceptionObject = PyUnicodeEncodeError_Create(
6393 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006394 }
6395 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006396 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6397 goto onError;
6398 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6399 goto onError;
6400 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6401 goto onError;
6402 return;
6403 onError:
6404 Py_DECREF(*exceptionObject);
6405 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006406 }
6407}
6408
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006409/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006410static void
6411raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006412 const char *encoding,
6413 const Py_UNICODE *unicode, Py_ssize_t size,
6414 Py_ssize_t startpos, Py_ssize_t endpos,
6415 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006416{
6417 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006418 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006419 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006420 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006421}
6422
6423/* error handling callback helper:
6424 build arguments, call the callback and check the arguments,
6425 put the result into newpos and return the replacement string, which
6426 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006427static PyObject *
6428unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006429 PyObject **errorHandler,
6430 const char *encoding, const char *reason,
6431 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6432 Py_ssize_t startpos, Py_ssize_t endpos,
6433 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006434{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006435 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006436
6437 PyObject *restuple;
6438 PyObject *resunicode;
6439
6440 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006441 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006442 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006443 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006444 }
6445
6446 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006447 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006448 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006449 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006450
6451 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006452 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006453 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006454 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006455 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006456 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006457 Py_DECREF(restuple);
6458 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006459 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006460 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006461 &resunicode, newpos)) {
6462 Py_DECREF(restuple);
6463 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006464 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006465 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6466 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6467 Py_DECREF(restuple);
6468 return NULL;
6469 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006470 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006471 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006472 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006473 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6474 Py_DECREF(restuple);
6475 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006476 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006477 Py_INCREF(resunicode);
6478 Py_DECREF(restuple);
6479 return resunicode;
6480}
6481
Alexander Belopolsky40018472011-02-26 01:02:56 +00006482static PyObject *
6483unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006484 Py_ssize_t size,
6485 const char *errors,
6486 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006487{
6488 /* output object */
6489 PyObject *res;
6490 /* pointers to the beginning and end+1 of input */
6491 const Py_UNICODE *startp = p;
6492 const Py_UNICODE *endp = p + size;
6493 /* pointer to the beginning of the unencodable characters */
6494 /* const Py_UNICODE *badp = NULL; */
6495 /* pointer into the output */
6496 char *str;
6497 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006498 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006499 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6500 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006501 PyObject *errorHandler = NULL;
6502 PyObject *exc = NULL;
6503 /* the following variable is used for caching string comparisons
6504 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6505 int known_errorHandler = -1;
6506
6507 /* allocate enough for a simple encoding without
6508 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006509 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006510 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006511 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006512 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006513 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006514 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006515 ressize = size;
6516
6517 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006518 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006519
Benjamin Peterson29060642009-01-31 22:14:21 +00006520 /* can we encode this? */
6521 if (c<limit) {
6522 /* no overflow check, because we know that the space is enough */
6523 *str++ = (char)c;
6524 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006525 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006526 else {
6527 Py_ssize_t unicodepos = p-startp;
6528 Py_ssize_t requiredsize;
6529 PyObject *repunicode;
6530 Py_ssize_t repsize;
6531 Py_ssize_t newpos;
6532 Py_ssize_t respos;
6533 Py_UNICODE *uni2;
6534 /* startpos for collecting unencodable chars */
6535 const Py_UNICODE *collstart = p;
6536 const Py_UNICODE *collend = p;
6537 /* find all unecodable characters */
6538 while ((collend < endp) && ((*collend)>=limit))
6539 ++collend;
6540 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6541 if (known_errorHandler==-1) {
6542 if ((errors==NULL) || (!strcmp(errors, "strict")))
6543 known_errorHandler = 1;
6544 else if (!strcmp(errors, "replace"))
6545 known_errorHandler = 2;
6546 else if (!strcmp(errors, "ignore"))
6547 known_errorHandler = 3;
6548 else if (!strcmp(errors, "xmlcharrefreplace"))
6549 known_errorHandler = 4;
6550 else
6551 known_errorHandler = 0;
6552 }
6553 switch (known_errorHandler) {
6554 case 1: /* strict */
6555 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6556 goto onError;
6557 case 2: /* replace */
6558 while (collstart++<collend)
6559 *str++ = '?'; /* fall through */
6560 case 3: /* ignore */
6561 p = collend;
6562 break;
6563 case 4: /* xmlcharrefreplace */
6564 respos = str - PyBytes_AS_STRING(res);
6565 /* determine replacement size (temporarily (mis)uses p) */
6566 for (p = collstart, repsize = 0; p < collend; ++p) {
6567 if (*p<10)
6568 repsize += 2+1+1;
6569 else if (*p<100)
6570 repsize += 2+2+1;
6571 else if (*p<1000)
6572 repsize += 2+3+1;
6573 else if (*p<10000)
6574 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006575#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006576 else
6577 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006578#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006579 else if (*p<100000)
6580 repsize += 2+5+1;
6581 else if (*p<1000000)
6582 repsize += 2+6+1;
6583 else
6584 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006585#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006586 }
6587 requiredsize = respos+repsize+(endp-collend);
6588 if (requiredsize > ressize) {
6589 if (requiredsize<2*ressize)
6590 requiredsize = 2*ressize;
6591 if (_PyBytes_Resize(&res, requiredsize))
6592 goto onError;
6593 str = PyBytes_AS_STRING(res) + respos;
6594 ressize = requiredsize;
6595 }
6596 /* generate replacement (temporarily (mis)uses p) */
6597 for (p = collstart; p < collend; ++p) {
6598 str += sprintf(str, "&#%d;", (int)*p);
6599 }
6600 p = collend;
6601 break;
6602 default:
6603 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6604 encoding, reason, startp, size, &exc,
6605 collstart-startp, collend-startp, &newpos);
6606 if (repunicode == NULL)
6607 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006608 if (PyBytes_Check(repunicode)) {
6609 /* Directly copy bytes result to output. */
6610 repsize = PyBytes_Size(repunicode);
6611 if (repsize > 1) {
6612 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006613 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006614 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6615 Py_DECREF(repunicode);
6616 goto onError;
6617 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006618 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006619 ressize += repsize-1;
6620 }
6621 memcpy(str, PyBytes_AsString(repunicode), repsize);
6622 str += repsize;
6623 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006624 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006625 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006626 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006627 /* need more space? (at least enough for what we
6628 have+the replacement+the rest of the string, so
6629 we won't have to check space for encodable characters) */
6630 respos = str - PyBytes_AS_STRING(res);
6631 repsize = PyUnicode_GET_SIZE(repunicode);
6632 requiredsize = respos+repsize+(endp-collend);
6633 if (requiredsize > ressize) {
6634 if (requiredsize<2*ressize)
6635 requiredsize = 2*ressize;
6636 if (_PyBytes_Resize(&res, requiredsize)) {
6637 Py_DECREF(repunicode);
6638 goto onError;
6639 }
6640 str = PyBytes_AS_STRING(res) + respos;
6641 ressize = requiredsize;
6642 }
6643 /* check if there is anything unencodable in the replacement
6644 and copy it to the output */
6645 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6646 c = *uni2;
6647 if (c >= limit) {
6648 raise_encode_exception(&exc, encoding, startp, size,
6649 unicodepos, unicodepos+1, reason);
6650 Py_DECREF(repunicode);
6651 goto onError;
6652 }
6653 *str = (char)c;
6654 }
6655 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006656 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006657 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006658 }
6659 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006660 /* Resize if we allocated to much */
6661 size = str - PyBytes_AS_STRING(res);
6662 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006663 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006664 if (_PyBytes_Resize(&res, size) < 0)
6665 goto onError;
6666 }
6667
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006668 Py_XDECREF(errorHandler);
6669 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006670 return res;
6671
6672 onError:
6673 Py_XDECREF(res);
6674 Py_XDECREF(errorHandler);
6675 Py_XDECREF(exc);
6676 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006677}
6678
Alexander Belopolsky40018472011-02-26 01:02:56 +00006679PyObject *
6680PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006681 Py_ssize_t size,
6682 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006684 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006685}
6686
Alexander Belopolsky40018472011-02-26 01:02:56 +00006687PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006688_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006689{
6690 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006691 PyErr_BadArgument();
6692 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006693 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006694 if (PyUnicode_READY(unicode) == -1)
6695 return NULL;
6696 /* Fast path: if it is a one-byte string, construct
6697 bytes object directly. */
6698 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6699 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6700 PyUnicode_GET_LENGTH(unicode));
6701 /* Non-Latin-1 characters present. Defer to above function to
6702 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006703 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006704 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006705 errors);
6706}
6707
6708PyObject*
6709PyUnicode_AsLatin1String(PyObject *unicode)
6710{
6711 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006712}
6713
6714/* --- 7-bit ASCII Codec -------------------------------------------------- */
6715
Alexander Belopolsky40018472011-02-26 01:02:56 +00006716PyObject *
6717PyUnicode_DecodeASCII(const char *s,
6718 Py_ssize_t size,
6719 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006720{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006721 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006722 PyUnicodeObject *v;
Victor Stinner702c7342011-10-05 13:50:52 +02006723 Py_UNICODE *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006724 Py_ssize_t startinpos;
6725 Py_ssize_t endinpos;
6726 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006727 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006728 int has_error;
6729 const unsigned char *p = (const unsigned char *)s;
6730 const unsigned char *end = p + size;
6731 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006732 PyObject *errorHandler = NULL;
6733 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006734
Guido van Rossumd57fd912000-03-10 22:53:23 +00006735 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006736 if (size == 1 && (unsigned char)s[0] < 128)
6737 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006738
Victor Stinner702c7342011-10-05 13:50:52 +02006739 has_error = 0;
6740 while (p < end && !has_error) {
6741 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6742 an explanation. */
6743 if (!((size_t) p & LONG_PTR_MASK)) {
6744 /* Help register allocation */
6745 register const unsigned char *_p = p;
6746 while (_p < aligned_end) {
6747 unsigned long value = *(unsigned long *) _p;
6748 if (value & ASCII_CHAR_MASK) {
6749 has_error = 1;
6750 break;
6751 }
6752 _p += SIZEOF_LONG;
6753 }
6754 if (_p == end)
6755 break;
6756 if (has_error)
6757 break;
6758 p = _p;
6759 }
6760 if (*p & 0x80) {
6761 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006762 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006763 }
6764 else {
6765 ++p;
6766 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006767 }
Victor Stinner702c7342011-10-05 13:50:52 +02006768 if (!has_error)
6769 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006770
Guido van Rossumd57fd912000-03-10 22:53:23 +00006771 v = _PyUnicode_New(size);
6772 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006773 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006774 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006775 return (PyObject *)v;
Victor Stinner702c7342011-10-05 13:50:52 +02006776 u = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006777 e = s + size;
6778 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006779 register unsigned char c = (unsigned char)*s;
6780 if (c < 128) {
Victor Stinner702c7342011-10-05 13:50:52 +02006781 *u++ = c;
Benjamin Peterson29060642009-01-31 22:14:21 +00006782 ++s;
6783 }
6784 else {
6785 startinpos = s-starts;
6786 endinpos = startinpos + 1;
Victor Stinner702c7342011-10-05 13:50:52 +02006787 outpos = u - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006788 if (unicode_decode_call_errorhandler(
6789 errors, &errorHandler,
6790 "ascii", "ordinal not in range(128)",
6791 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinner702c7342011-10-05 13:50:52 +02006792 &v, &outpos, &u))
Benjamin Peterson29060642009-01-31 22:14:21 +00006793 goto onError;
6794 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006795 }
Victor Stinner702c7342011-10-05 13:50:52 +02006796 if (u - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
6797 if (PyUnicode_Resize((PyObject**)&v, u - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006798 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006799 Py_XDECREF(errorHandler);
6800 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006801#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006802 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006803 Py_DECREF(v);
6804 return NULL;
6805 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006806#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006807 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006808 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006809
Benjamin Peterson29060642009-01-31 22:14:21 +00006810 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006811 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006812 Py_XDECREF(errorHandler);
6813 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006814 return NULL;
6815}
6816
Alexander Belopolsky40018472011-02-26 01:02:56 +00006817PyObject *
6818PyUnicode_EncodeASCII(const Py_UNICODE *p,
6819 Py_ssize_t size,
6820 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006822 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006823}
6824
Alexander Belopolsky40018472011-02-26 01:02:56 +00006825PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006826_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006827{
6828 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006829 PyErr_BadArgument();
6830 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006831 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006832 if (PyUnicode_READY(unicode) == -1)
6833 return NULL;
6834 /* Fast path: if it is an ASCII-only string, construct bytes object
6835 directly. Else defer to above function to raise the exception. */
6836 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6837 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6838 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006839 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006840 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006841 errors);
6842}
6843
6844PyObject *
6845PyUnicode_AsASCIIString(PyObject *unicode)
6846{
6847 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006848}
6849
Victor Stinner99b95382011-07-04 14:23:54 +02006850#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006851
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006852/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006853
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006854#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006855#define NEED_RETRY
6856#endif
6857
6858/* XXX This code is limited to "true" double-byte encodings, as
6859 a) it assumes an incomplete character consists of a single byte, and
6860 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006861 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006862
/* Return non-zero if the byte at s[offset] should be treated as a DBCS
   lead byte, 0 otherwise.

   A byte that IsDBCSLeadByte() accepts only counts when the character
   before it (as located by CharPrev()) is the byte itself, is not a
   lead byte, or is two bytes wide. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
}
6874
6875/*
6876 * Decode MBCS string into unicode object. If 'final' is set, converts
6877 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6878 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006879static int
6880decode_mbcs(PyUnicodeObject **v,
6881 const char *s, /* MBCS string */
6882 int size, /* sizeof MBCS string */
6883 int final,
6884 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006885{
6886 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006887 Py_ssize_t n;
6888 DWORD usize;
6889 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006890
6891 assert(size >= 0);
6892
Victor Stinner554f3f02010-06-16 23:33:54 +00006893 /* check and handle 'errors' arg */
6894 if (errors==NULL || strcmp(errors, "strict")==0)
6895 flags = MB_ERR_INVALID_CHARS;
6896 else if (strcmp(errors, "ignore")==0)
6897 flags = 0;
6898 else {
6899 PyErr_Format(PyExc_ValueError,
6900 "mbcs encoding does not support errors='%s'",
6901 errors);
6902 return -1;
6903 }
6904
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006905 /* Skip trailing lead-byte unless 'final' is set */
6906 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006907 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006908
6909 /* First get the size of the result */
6910 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006911 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6912 if (usize==0)
6913 goto mbcs_decode_error;
6914 } else
6915 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006916
6917 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006918 /* Create unicode object */
6919 *v = _PyUnicode_New(usize);
6920 if (*v == NULL)
6921 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006922 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006923 }
6924 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006925 /* Extend unicode object */
6926 n = PyUnicode_GET_SIZE(*v);
Victor Stinner2fd82272011-10-03 04:06:05 +02006927 if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006928 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006929 }
6930
6931 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006932 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006933 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006934 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6935 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006936 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006937 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006938 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006939
6940mbcs_decode_error:
6941 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6942 we raise a UnicodeDecodeError - else it is a 'generic'
6943 windows error
6944 */
6945 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6946 /* Ideally, we should get reason from FormatMessage - this
6947 is the Windows 2000 English version of the message
6948 */
6949 PyObject *exc = NULL;
6950 const char *reason = "No mapping for the Unicode character exists "
6951 "in the target multi-byte code page.";
6952 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6953 if (exc != NULL) {
6954 PyCodec_StrictErrors(exc);
6955 Py_DECREF(exc);
6956 }
6957 } else {
6958 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6959 }
6960 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006961}
6962
Alexander Belopolsky40018472011-02-26 01:02:56 +00006963PyObject *
6964PyUnicode_DecodeMBCSStateful(const char *s,
6965 Py_ssize_t size,
6966 const char *errors,
6967 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006968{
6969 PyUnicodeObject *v = NULL;
6970 int done;
6971
6972 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006973 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006974
6975#ifdef NEED_RETRY
6976 retry:
6977 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006978 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006979 else
6980#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006981 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006982
6983 if (done < 0) {
6984 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006985 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006986 }
6987
6988 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006989 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006990
6991#ifdef NEED_RETRY
6992 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006993 s += done;
6994 size -= done;
6995 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006996 }
6997#endif
Victor Stinner17efeed2011-10-04 20:05:46 +02006998#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006999 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007000 Py_DECREF(v);
7001 return NULL;
7002 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007003#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007004 assert(_PyUnicode_CheckConsistency(v, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007005 return (PyObject *)v;
7006}
7007
Alexander Belopolsky40018472011-02-26 01:02:56 +00007008PyObject *
7009PyUnicode_DecodeMBCS(const char *s,
7010 Py_ssize_t size,
7011 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007012{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007013 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7014}
7015
7016/*
7017 * Convert unicode into string object (MBCS).
7018 * Returns 0 if succeed, -1 otherwise.
7019 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007020static int
7021encode_mbcs(PyObject **repr,
7022 const Py_UNICODE *p, /* unicode */
7023 int size, /* size of unicode */
7024 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007025{
Victor Stinner554f3f02010-06-16 23:33:54 +00007026 BOOL usedDefaultChar = FALSE;
7027 BOOL *pusedDefaultChar;
7028 int mbcssize;
7029 Py_ssize_t n;
7030 PyObject *exc = NULL;
7031 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007032
7033 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007034
Victor Stinner554f3f02010-06-16 23:33:54 +00007035 /* check and handle 'errors' arg */
7036 if (errors==NULL || strcmp(errors, "strict")==0) {
7037 flags = WC_NO_BEST_FIT_CHARS;
7038 pusedDefaultChar = &usedDefaultChar;
7039 } else if (strcmp(errors, "replace")==0) {
7040 flags = 0;
7041 pusedDefaultChar = NULL;
7042 } else {
7043 PyErr_Format(PyExc_ValueError,
7044 "mbcs encoding does not support errors='%s'",
7045 errors);
7046 return -1;
7047 }
7048
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007049 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007050 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00007051 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
7052 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00007053 if (mbcssize == 0) {
7054 PyErr_SetFromWindowsErrWithFilename(0, NULL);
7055 return -1;
7056 }
Victor Stinner554f3f02010-06-16 23:33:54 +00007057 /* If we used a default char, then we failed! */
7058 if (pusedDefaultChar && *pusedDefaultChar)
7059 goto mbcs_encode_error;
7060 } else {
7061 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007062 }
7063
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007064 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007065 /* Create string object */
7066 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
7067 if (*repr == NULL)
7068 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00007069 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007070 }
7071 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007072 /* Extend string object */
7073 n = PyBytes_Size(*repr);
7074 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
7075 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007076 }
7077
7078 /* Do the conversion */
7079 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007080 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00007081 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
7082 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007083 PyErr_SetFromWindowsErrWithFilename(0, NULL);
7084 return -1;
7085 }
Victor Stinner554f3f02010-06-16 23:33:54 +00007086 if (pusedDefaultChar && *pusedDefaultChar)
7087 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007088 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007089 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007090
7091mbcs_encode_error:
7092 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
7093 Py_XDECREF(exc);
7094 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007095}
7096
Alexander Belopolsky40018472011-02-26 01:02:56 +00007097PyObject *
7098PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7099 Py_ssize_t size,
7100 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007101{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007102 PyObject *repr = NULL;
7103 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00007104
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007105#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00007106 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007107 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00007108 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007109 else
7110#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00007111 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007112
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007113 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007114 Py_XDECREF(repr);
7115 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007116 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007117
7118#ifdef NEED_RETRY
7119 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007120 p += INT_MAX;
7121 size -= INT_MAX;
7122 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007123 }
7124#endif
7125
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007126 return repr;
7127}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007128
Alexander Belopolsky40018472011-02-26 01:02:56 +00007129PyObject *
7130PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007131{
7132 if (!PyUnicode_Check(unicode)) {
7133 PyErr_BadArgument();
7134 return NULL;
7135 }
7136 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007137 PyUnicode_GET_SIZE(unicode),
7138 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007139}
7140
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007141#undef NEED_RETRY
7142
Victor Stinner99b95382011-07-04 14:23:54 +02007143#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007144
Guido van Rossumd57fd912000-03-10 22:53:23 +00007145/* --- Character Mapping Codec -------------------------------------------- */
7146
Alexander Belopolsky40018472011-02-26 01:02:56 +00007147PyObject *
7148PyUnicode_DecodeCharmap(const char *s,
7149 Py_ssize_t size,
7150 PyObject *mapping,
7151 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007152{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007153 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007154 Py_ssize_t startinpos;
7155 Py_ssize_t endinpos;
7156 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007157 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007158 PyUnicodeObject *v;
7159 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007160 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007161 PyObject *errorHandler = NULL;
7162 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007163 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007164 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00007165
Guido van Rossumd57fd912000-03-10 22:53:23 +00007166 /* Default to Latin-1 */
7167 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007168 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007169
7170 v = _PyUnicode_New(size);
7171 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007172 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007173 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007174 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007175 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007176 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007177 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007178 mapstring = PyUnicode_AS_UNICODE(mapping);
7179 maplen = PyUnicode_GET_SIZE(mapping);
7180 while (s < e) {
7181 unsigned char ch = *s;
7182 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007183
Benjamin Peterson29060642009-01-31 22:14:21 +00007184 if (ch < maplen)
7185 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00007186
Benjamin Peterson29060642009-01-31 22:14:21 +00007187 if (x == 0xfffe) {
7188 /* undefined mapping */
7189 outpos = p-PyUnicode_AS_UNICODE(v);
7190 startinpos = s-starts;
7191 endinpos = startinpos+1;
7192 if (unicode_decode_call_errorhandler(
7193 errors, &errorHandler,
7194 "charmap", "character maps to <undefined>",
7195 &starts, &e, &startinpos, &endinpos, &exc, &s,
7196 &v, &outpos, &p)) {
7197 goto onError;
7198 }
7199 continue;
7200 }
7201 *p++ = x;
7202 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007203 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007204 }
7205 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007206 while (s < e) {
7207 unsigned char ch = *s;
7208 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007209
Benjamin Peterson29060642009-01-31 22:14:21 +00007210 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7211 w = PyLong_FromLong((long)ch);
7212 if (w == NULL)
7213 goto onError;
7214 x = PyObject_GetItem(mapping, w);
7215 Py_DECREF(w);
7216 if (x == NULL) {
7217 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7218 /* No mapping found means: mapping is undefined. */
7219 PyErr_Clear();
7220 x = Py_None;
7221 Py_INCREF(x);
7222 } else
7223 goto onError;
7224 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007225
Benjamin Peterson29060642009-01-31 22:14:21 +00007226 /* Apply mapping */
7227 if (PyLong_Check(x)) {
7228 long value = PyLong_AS_LONG(x);
7229 if (value < 0 || value > 65535) {
7230 PyErr_SetString(PyExc_TypeError,
7231 "character mapping must be in range(65536)");
7232 Py_DECREF(x);
7233 goto onError;
7234 }
7235 *p++ = (Py_UNICODE)value;
7236 }
7237 else if (x == Py_None) {
7238 /* undefined mapping */
7239 outpos = p-PyUnicode_AS_UNICODE(v);
7240 startinpos = s-starts;
7241 endinpos = startinpos+1;
7242 if (unicode_decode_call_errorhandler(
7243 errors, &errorHandler,
7244 "charmap", "character maps to <undefined>",
7245 &starts, &e, &startinpos, &endinpos, &exc, &s,
7246 &v, &outpos, &p)) {
7247 Py_DECREF(x);
7248 goto onError;
7249 }
7250 Py_DECREF(x);
7251 continue;
7252 }
7253 else if (PyUnicode_Check(x)) {
7254 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007255
Benjamin Peterson29060642009-01-31 22:14:21 +00007256 if (targetsize == 1)
7257 /* 1-1 mapping */
7258 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007259
Benjamin Peterson29060642009-01-31 22:14:21 +00007260 else if (targetsize > 1) {
7261 /* 1-n mapping */
7262 if (targetsize > extrachars) {
7263 /* resize first */
7264 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
7265 Py_ssize_t needed = (targetsize - extrachars) + \
7266 (targetsize << 2);
7267 extrachars += needed;
7268 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02007269 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00007270 PyUnicode_GET_SIZE(v) + needed) < 0) {
7271 Py_DECREF(x);
7272 goto onError;
7273 }
7274 p = PyUnicode_AS_UNICODE(v) + oldpos;
7275 }
7276 Py_UNICODE_COPY(p,
7277 PyUnicode_AS_UNICODE(x),
7278 targetsize);
7279 p += targetsize;
7280 extrachars -= targetsize;
7281 }
7282 /* 1-0 mapping: skip the character */
7283 }
7284 else {
7285 /* wrong return value */
7286 PyErr_SetString(PyExc_TypeError,
7287 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007288 Py_DECREF(x);
7289 goto onError;
7290 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007291 Py_DECREF(x);
7292 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007293 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007294 }
7295 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02007296 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007297 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007298 Py_XDECREF(errorHandler);
7299 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02007300#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007301 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007302 Py_DECREF(v);
7303 return NULL;
7304 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007305#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007306 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007307 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00007308
Benjamin Peterson29060642009-01-31 22:14:21 +00007309 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007310 Py_XDECREF(errorHandler);
7311 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007312 Py_XDECREF(v);
7313 return NULL;
7314}
7315
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007316/* Charmap encoding: the lookup table */
7317
Alexander Belopolsky40018472011-02-26 01:02:56 +00007318struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007319 PyObject_HEAD
7320 unsigned char level1[32];
7321 int count2, count3;
7322 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007323};
7324
7325static PyObject*
7326encoding_map_size(PyObject *obj, PyObject* args)
7327{
7328 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007329 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007330 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007331}
7332
7333static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007334 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007335 PyDoc_STR("Return the size (in bytes) of this object") },
7336 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007337};
7338
7339static void
7340encoding_map_dealloc(PyObject* o)
7341{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007342 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007343}
7344
7345static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007346 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007347 "EncodingMap", /*tp_name*/
7348 sizeof(struct encoding_map), /*tp_basicsize*/
7349 0, /*tp_itemsize*/
7350 /* methods */
7351 encoding_map_dealloc, /*tp_dealloc*/
7352 0, /*tp_print*/
7353 0, /*tp_getattr*/
7354 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007355 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007356 0, /*tp_repr*/
7357 0, /*tp_as_number*/
7358 0, /*tp_as_sequence*/
7359 0, /*tp_as_mapping*/
7360 0, /*tp_hash*/
7361 0, /*tp_call*/
7362 0, /*tp_str*/
7363 0, /*tp_getattro*/
7364 0, /*tp_setattro*/
7365 0, /*tp_as_buffer*/
7366 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7367 0, /*tp_doc*/
7368 0, /*tp_traverse*/
7369 0, /*tp_clear*/
7370 0, /*tp_richcompare*/
7371 0, /*tp_weaklistoffset*/
7372 0, /*tp_iter*/
7373 0, /*tp_iternext*/
7374 encoding_map_methods, /*tp_methods*/
7375 0, /*tp_members*/
7376 0, /*tp_getset*/
7377 0, /*tp_base*/
7378 0, /*tp_dict*/
7379 0, /*tp_descr_get*/
7380 0, /*tp_descr_set*/
7381 0, /*tp_dictoffset*/
7382 0, /*tp_init*/
7383 0, /*tp_alloc*/
7384 0, /*tp_new*/
7385 0, /*tp_free*/
7386 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007387};
7388
7389PyObject*
7390PyUnicode_BuildEncodingMap(PyObject* string)
7391{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007392 PyObject *result;
7393 struct encoding_map *mresult;
7394 int i;
7395 int need_dict = 0;
7396 unsigned char level1[32];
7397 unsigned char level2[512];
7398 unsigned char *mlevel1, *mlevel2, *mlevel3;
7399 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007400 int kind;
7401 void *data;
7402 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007403
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007404 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007405 PyErr_BadArgument();
7406 return NULL;
7407 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007408 kind = PyUnicode_KIND(string);
7409 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007410 memset(level1, 0xFF, sizeof level1);
7411 memset(level2, 0xFF, sizeof level2);
7412
7413 /* If there isn't a one-to-one mapping of NULL to \0,
7414 or if there are non-BMP characters, we need to use
7415 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007416 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007417 need_dict = 1;
7418 for (i = 1; i < 256; i++) {
7419 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007420 ch = PyUnicode_READ(kind, data, i);
7421 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007422 need_dict = 1;
7423 break;
7424 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007425 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007426 /* unmapped character */
7427 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007428 l1 = ch >> 11;
7429 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007430 if (level1[l1] == 0xFF)
7431 level1[l1] = count2++;
7432 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007433 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007434 }
7435
7436 if (count2 >= 0xFF || count3 >= 0xFF)
7437 need_dict = 1;
7438
7439 if (need_dict) {
7440 PyObject *result = PyDict_New();
7441 PyObject *key, *value;
7442 if (!result)
7443 return NULL;
7444 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007445 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007446 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007447 if (!key || !value)
7448 goto failed1;
7449 if (PyDict_SetItem(result, key, value) == -1)
7450 goto failed1;
7451 Py_DECREF(key);
7452 Py_DECREF(value);
7453 }
7454 return result;
7455 failed1:
7456 Py_XDECREF(key);
7457 Py_XDECREF(value);
7458 Py_DECREF(result);
7459 return NULL;
7460 }
7461
7462 /* Create a three-level trie */
7463 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7464 16*count2 + 128*count3 - 1);
7465 if (!result)
7466 return PyErr_NoMemory();
7467 PyObject_Init(result, &EncodingMapType);
7468 mresult = (struct encoding_map*)result;
7469 mresult->count2 = count2;
7470 mresult->count3 = count3;
7471 mlevel1 = mresult->level1;
7472 mlevel2 = mresult->level23;
7473 mlevel3 = mresult->level23 + 16*count2;
7474 memcpy(mlevel1, level1, 32);
7475 memset(mlevel2, 0xFF, 16*count2);
7476 memset(mlevel3, 0, 128*count3);
7477 count3 = 0;
7478 for (i = 1; i < 256; i++) {
7479 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007480 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007481 /* unmapped character */
7482 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007483 o1 = PyUnicode_READ(kind, data, i)>>11;
7484 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007485 i2 = 16*mlevel1[o1] + o2;
7486 if (mlevel2[i2] == 0xFF)
7487 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007488 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007489 i3 = 128*mlevel2[i2] + o3;
7490 mlevel3[i3] = i;
7491 }
7492 return result;
7493}
7494
7495static int
7496encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7497{
7498 struct encoding_map *map = (struct encoding_map*)mapping;
7499 int l1 = c>>11;
7500 int l2 = (c>>7) & 0xF;
7501 int l3 = c & 0x7F;
7502 int i;
7503
7504#ifdef Py_UNICODE_WIDE
7505 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007506 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007507 }
7508#endif
7509 if (c == 0)
7510 return 0;
7511 /* level 1*/
7512 i = map->level1[l1];
7513 if (i == 0xFF) {
7514 return -1;
7515 }
7516 /* level 2*/
7517 i = map->level23[16*i+l2];
7518 if (i == 0xFF) {
7519 return -1;
7520 }
7521 /* level 3 */
7522 i = map->level23[16*map->count2 + 128*i + l3];
7523 if (i == 0) {
7524 return -1;
7525 }
7526 return i;
7527}
7528
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007529/* Lookup the character ch in the mapping. If the character
7530 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007531 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007532static PyObject *
7533charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007534{
Christian Heimes217cfd12007-12-02 14:31:20 +00007535 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007536 PyObject *x;
7537
7538 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007539 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007540 x = PyObject_GetItem(mapping, w);
7541 Py_DECREF(w);
7542 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007543 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7544 /* No mapping found means: mapping is undefined. */
7545 PyErr_Clear();
7546 x = Py_None;
7547 Py_INCREF(x);
7548 return x;
7549 } else
7550 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007551 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007552 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007553 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007554 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007555 long value = PyLong_AS_LONG(x);
7556 if (value < 0 || value > 255) {
7557 PyErr_SetString(PyExc_TypeError,
7558 "character mapping must be in range(256)");
7559 Py_DECREF(x);
7560 return NULL;
7561 }
7562 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007563 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007564 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007565 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007566 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007567 /* wrong return value */
7568 PyErr_Format(PyExc_TypeError,
7569 "character mapping must return integer, bytes or None, not %.400s",
7570 x->ob_type->tp_name);
7571 Py_DECREF(x);
7572 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007573 }
7574}
7575
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007576static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007577charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007578{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007579 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7580 /* exponentially overallocate to minimize reallocations */
7581 if (requiredsize < 2*outsize)
7582 requiredsize = 2*outsize;
7583 if (_PyBytes_Resize(outobj, requiredsize))
7584 return -1;
7585 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007586}
7587
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS = 0,    /* the character was encoded and written */
    enc_FAILED = 1,     /* the mapping is undefined for the character */
    enc_EXCEPTION = 2   /* the lookup or a buffer resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007591/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007592 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007593 space is available. Return a new reference to the object that
7594 was put in the output buffer, or Py_None, if the mapping was undefined
7595 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007596 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007597static charmapencode_result
7598charmapencode_output(Py_UNICODE c, PyObject *mapping,
7599 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007600{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007601 PyObject *rep;
7602 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007603 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007604
Christian Heimes90aa7642007-12-19 02:45:37 +00007605 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007606 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007607 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007608 if (res == -1)
7609 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007610 if (outsize<requiredsize)
7611 if (charmapencode_resize(outobj, outpos, requiredsize))
7612 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007613 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007614 outstart[(*outpos)++] = (char)res;
7615 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007616 }
7617
7618 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007619 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007620 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007621 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007622 Py_DECREF(rep);
7623 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007624 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007625 if (PyLong_Check(rep)) {
7626 Py_ssize_t requiredsize = *outpos+1;
7627 if (outsize<requiredsize)
7628 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7629 Py_DECREF(rep);
7630 return enc_EXCEPTION;
7631 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007632 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007633 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007634 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007635 else {
7636 const char *repchars = PyBytes_AS_STRING(rep);
7637 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7638 Py_ssize_t requiredsize = *outpos+repsize;
7639 if (outsize<requiredsize)
7640 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7641 Py_DECREF(rep);
7642 return enc_EXCEPTION;
7643 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007644 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007645 memcpy(outstart + *outpos, repchars, repsize);
7646 *outpos += repsize;
7647 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007648 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007649 Py_DECREF(rep);
7650 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007651}
7652
7653/* handle an error in PyUnicode_EncodeCharmap
7654 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007655static int
7656charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00007657 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007658 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007659 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007660 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007661{
7662 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007663 Py_ssize_t repsize;
7664 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007665 Py_UNICODE *uni2;
7666 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007667 Py_ssize_t collstartpos = *inpos;
7668 Py_ssize_t collendpos = *inpos+1;
7669 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007670 char *encoding = "charmap";
7671 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007672 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007673
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007674 /* find all unencodable characters */
7675 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007676 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007677 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007678 int res = encoding_map_lookup(p[collendpos], mapping);
7679 if (res != -1)
7680 break;
7681 ++collendpos;
7682 continue;
7683 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007684
Benjamin Peterson29060642009-01-31 22:14:21 +00007685 rep = charmapencode_lookup(p[collendpos], mapping);
7686 if (rep==NULL)
7687 return -1;
7688 else if (rep!=Py_None) {
7689 Py_DECREF(rep);
7690 break;
7691 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007692 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007693 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007694 }
7695 /* cache callback name lookup
7696 * (if not done yet, i.e. it's the first error) */
7697 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007698 if ((errors==NULL) || (!strcmp(errors, "strict")))
7699 *known_errorHandler = 1;
7700 else if (!strcmp(errors, "replace"))
7701 *known_errorHandler = 2;
7702 else if (!strcmp(errors, "ignore"))
7703 *known_errorHandler = 3;
7704 else if (!strcmp(errors, "xmlcharrefreplace"))
7705 *known_errorHandler = 4;
7706 else
7707 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007708 }
7709 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007710 case 1: /* strict */
7711 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7712 return -1;
7713 case 2: /* replace */
7714 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007715 x = charmapencode_output('?', mapping, res, respos);
7716 if (x==enc_EXCEPTION) {
7717 return -1;
7718 }
7719 else if (x==enc_FAILED) {
7720 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7721 return -1;
7722 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007723 }
7724 /* fall through */
7725 case 3: /* ignore */
7726 *inpos = collendpos;
7727 break;
7728 case 4: /* xmlcharrefreplace */
7729 /* generate replacement (temporarily (mis)uses p) */
7730 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007731 char buffer[2+29+1+1];
7732 char *cp;
7733 sprintf(buffer, "&#%d;", (int)p[collpos]);
7734 for (cp = buffer; *cp; ++cp) {
7735 x = charmapencode_output(*cp, mapping, res, respos);
7736 if (x==enc_EXCEPTION)
7737 return -1;
7738 else if (x==enc_FAILED) {
7739 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7740 return -1;
7741 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007742 }
7743 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007744 *inpos = collendpos;
7745 break;
7746 default:
7747 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007748 encoding, reason, p, size, exceptionObject,
7749 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007750 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007751 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007752 if (PyBytes_Check(repunicode)) {
7753 /* Directly copy bytes result to output. */
7754 Py_ssize_t outsize = PyBytes_Size(*res);
7755 Py_ssize_t requiredsize;
7756 repsize = PyBytes_Size(repunicode);
7757 requiredsize = *respos + repsize;
7758 if (requiredsize > outsize)
7759 /* Make room for all additional bytes. */
7760 if (charmapencode_resize(res, respos, requiredsize)) {
7761 Py_DECREF(repunicode);
7762 return -1;
7763 }
7764 memcpy(PyBytes_AsString(*res) + *respos,
7765 PyBytes_AsString(repunicode), repsize);
7766 *respos += repsize;
7767 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007768 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007769 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007770 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007771 /* generate replacement */
7772 repsize = PyUnicode_GET_SIZE(repunicode);
7773 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007774 x = charmapencode_output(*uni2, mapping, res, respos);
7775 if (x==enc_EXCEPTION) {
7776 return -1;
7777 }
7778 else if (x==enc_FAILED) {
7779 Py_DECREF(repunicode);
7780 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7781 return -1;
7782 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007783 }
7784 *inpos = newpos;
7785 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007786 }
7787 return 0;
7788}
7789
Alexander Belopolsky40018472011-02-26 01:02:56 +00007790PyObject *
7791PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7792 Py_ssize_t size,
7793 PyObject *mapping,
7794 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007795{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007796 /* output object */
7797 PyObject *res = NULL;
7798 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007799 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007800 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007801 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007802 PyObject *errorHandler = NULL;
7803 PyObject *exc = NULL;
7804 /* the following variable is used for caching string comparisons
7805 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7806 * 3=ignore, 4=xmlcharrefreplace */
7807 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007808
7809 /* Default to Latin-1 */
7810 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007811 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007812
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007813 /* allocate enough for a simple encoding without
7814 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007815 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007816 if (res == NULL)
7817 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007818 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007819 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007820
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007821 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007822 /* try to encode it */
7823 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7824 if (x==enc_EXCEPTION) /* error */
7825 goto onError;
7826 if (x==enc_FAILED) { /* unencodable character */
7827 if (charmap_encoding_error(p, size, &inpos, mapping,
7828 &exc,
7829 &known_errorHandler, &errorHandler, errors,
7830 &res, &respos)) {
7831 goto onError;
7832 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007833 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007834 else
7835 /* done with this character => adjust input position */
7836 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007837 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007838
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007839 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007840 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007841 if (_PyBytes_Resize(&res, respos) < 0)
7842 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007843
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007844 Py_XDECREF(exc);
7845 Py_XDECREF(errorHandler);
7846 return res;
7847
Benjamin Peterson29060642009-01-31 22:14:21 +00007848 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007849 Py_XDECREF(res);
7850 Py_XDECREF(exc);
7851 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007852 return NULL;
7853}
7854
Alexander Belopolsky40018472011-02-26 01:02:56 +00007855PyObject *
7856PyUnicode_AsCharmapString(PyObject *unicode,
7857 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007858{
7859 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007860 PyErr_BadArgument();
7861 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007862 }
7863 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007864 PyUnicode_GET_SIZE(unicode),
7865 mapping,
7866 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007867}
7868
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007869/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007870static void
7871make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007872 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007873 Py_ssize_t startpos, Py_ssize_t endpos,
7874 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007875{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007876 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007877 *exceptionObject = _PyUnicodeTranslateError_Create(
7878 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007879 }
7880 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007881 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7882 goto onError;
7883 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7884 goto onError;
7885 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7886 goto onError;
7887 return;
7888 onError:
7889 Py_DECREF(*exceptionObject);
7890 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007891 }
7892}
7893
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007894/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007895static void
7896raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007897 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007898 Py_ssize_t startpos, Py_ssize_t endpos,
7899 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007900{
7901 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007902 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007903 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007904 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007905}
7906
7907/* error handling callback helper:
7908 build arguments, call the callback and check the arguments,
7909 put the result into newpos and return the replacement string, which
7910 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007911static PyObject *
7912unicode_translate_call_errorhandler(const char *errors,
7913 PyObject **errorHandler,
7914 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007915 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007916 Py_ssize_t startpos, Py_ssize_t endpos,
7917 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007918{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007919 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007920
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007921 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007922 PyObject *restuple;
7923 PyObject *resunicode;
7924
7925 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007926 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007927 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007928 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007929 }
7930
7931 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007932 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007933 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007934 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007935
7936 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007937 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007938 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007939 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007940 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007941 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007942 Py_DECREF(restuple);
7943 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007944 }
7945 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007946 &resunicode, &i_newpos)) {
7947 Py_DECREF(restuple);
7948 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007949 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007950 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007951 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007952 else
7953 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007954 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007955 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7956 Py_DECREF(restuple);
7957 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007958 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007959 Py_INCREF(resunicode);
7960 Py_DECREF(restuple);
7961 return resunicode;
7962}
7963
7964/* Lookup the character ch in the mapping and put the result in result,
7965 which must be decrefed by the caller.
7966 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007967static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007968charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007969{
Christian Heimes217cfd12007-12-02 14:31:20 +00007970 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007971 PyObject *x;
7972
7973 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007974 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007975 x = PyObject_GetItem(mapping, w);
7976 Py_DECREF(w);
7977 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007978 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7979 /* No mapping found means: use 1:1 mapping. */
7980 PyErr_Clear();
7981 *result = NULL;
7982 return 0;
7983 } else
7984 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007985 }
7986 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007987 *result = x;
7988 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007989 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007990 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007991 long value = PyLong_AS_LONG(x);
7992 long max = PyUnicode_GetMax();
7993 if (value < 0 || value > max) {
7994 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007995 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007996 Py_DECREF(x);
7997 return -1;
7998 }
7999 *result = x;
8000 return 0;
8001 }
8002 else if (PyUnicode_Check(x)) {
8003 *result = x;
8004 return 0;
8005 }
8006 else {
8007 /* wrong return value */
8008 PyErr_SetString(PyExc_TypeError,
8009 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008010 Py_DECREF(x);
8011 return -1;
8012 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008013}
8014/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008015 if not reallocate and adjust various state variables.
8016 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008017static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008018charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008019 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008020{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008021 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008022 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008023 /* exponentially overallocate to minimize reallocations */
8024 if (requiredsize < 2 * oldsize)
8025 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008026 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8027 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008028 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008029 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008030 }
8031 return 0;
8032}
8033/* lookup the character, put the result in the output string and adjust
8034 various state variables. Return a new reference to the object that
8035 was put in the output buffer in *result, or Py_None, if the mapping was
8036 undefined (in which case no character was written).
8037 The called must decref result.
8038 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008039static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008040charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8041 PyObject *mapping, Py_UCS4 **output,
8042 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008043 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008044{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008045 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8046 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008047 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008048 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008049 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008050 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008051 }
8052 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008053 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008054 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008055 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008056 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008057 }
8058 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008059 Py_ssize_t repsize;
8060 if (PyUnicode_READY(*res) == -1)
8061 return -1;
8062 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008063 if (repsize==1) {
8064 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008065 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008066 }
8067 else if (repsize!=0) {
8068 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008069 Py_ssize_t requiredsize = *opos +
8070 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008071 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008072 Py_ssize_t i;
8073 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008074 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008075 for(i = 0; i < repsize; i++)
8076 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008077 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008078 }
8079 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008080 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008081 return 0;
8082}
8083
Alexander Belopolsky40018472011-02-26 01:02:56 +00008084PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008085_PyUnicode_TranslateCharmap(PyObject *input,
8086 PyObject *mapping,
8087 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008088{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008089 /* input object */
8090 char *idata;
8091 Py_ssize_t size, i;
8092 int kind;
8093 /* output buffer */
8094 Py_UCS4 *output = NULL;
8095 Py_ssize_t osize;
8096 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008097 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008098 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008099 char *reason = "character maps to <undefined>";
8100 PyObject *errorHandler = NULL;
8101 PyObject *exc = NULL;
8102 /* the following variable is used for caching string comparisons
8103 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8104 * 3=ignore, 4=xmlcharrefreplace */
8105 int known_errorHandler = -1;
8106
Guido van Rossumd57fd912000-03-10 22:53:23 +00008107 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008108 PyErr_BadArgument();
8109 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008110 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008111
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008112 if (PyUnicode_READY(input) == -1)
8113 return NULL;
8114 idata = (char*)PyUnicode_DATA(input);
8115 kind = PyUnicode_KIND(input);
8116 size = PyUnicode_GET_LENGTH(input);
8117 i = 0;
8118
8119 if (size == 0) {
8120 Py_INCREF(input);
8121 return input;
8122 }
8123
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008124 /* allocate enough for a simple 1:1 translation without
8125 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008126 osize = size;
8127 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8128 opos = 0;
8129 if (output == NULL) {
8130 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008131 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008132 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008133
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008134 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008135 /* try to encode it */
8136 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008137 if (charmaptranslate_output(input, i, mapping,
8138 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008139 Py_XDECREF(x);
8140 goto onError;
8141 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008142 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008143 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008144 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008145 else { /* untranslatable character */
8146 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8147 Py_ssize_t repsize;
8148 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008149 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008150 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008151 Py_ssize_t collstart = i;
8152 Py_ssize_t collend = i+1;
8153 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008154
Benjamin Peterson29060642009-01-31 22:14:21 +00008155 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008156 while (collend < size) {
8157 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008158 goto onError;
8159 Py_XDECREF(x);
8160 if (x!=Py_None)
8161 break;
8162 ++collend;
8163 }
8164 /* cache callback name lookup
8165 * (if not done yet, i.e. it's the first error) */
8166 if (known_errorHandler==-1) {
8167 if ((errors==NULL) || (!strcmp(errors, "strict")))
8168 known_errorHandler = 1;
8169 else if (!strcmp(errors, "replace"))
8170 known_errorHandler = 2;
8171 else if (!strcmp(errors, "ignore"))
8172 known_errorHandler = 3;
8173 else if (!strcmp(errors, "xmlcharrefreplace"))
8174 known_errorHandler = 4;
8175 else
8176 known_errorHandler = 0;
8177 }
8178 switch (known_errorHandler) {
8179 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008180 raise_translate_exception(&exc, input, collstart,
8181 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008182 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008183 case 2: /* replace */
8184 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008185 for (coll = collstart; coll<collend; coll++)
8186 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008187 /* fall through */
8188 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008189 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008190 break;
8191 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008192 /* generate replacement (temporarily (mis)uses i) */
8193 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008194 char buffer[2+29+1+1];
8195 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008196 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8197 if (charmaptranslate_makespace(&output, &osize,
8198 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008199 goto onError;
8200 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008201 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008202 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008203 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008204 break;
8205 default:
8206 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008207 reason, input, &exc,
8208 collstart, collend, &newpos);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008209 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00008210 goto onError;
8211 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008212 repsize = PyUnicode_GET_LENGTH(repunicode);
8213 if (charmaptranslate_makespace(&output, &osize,
8214 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008215 Py_DECREF(repunicode);
8216 goto onError;
8217 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008218 for (uni2 = 0; repsize-->0; ++uni2)
8219 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8220 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008221 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008222 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008223 }
8224 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008225 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8226 if (!res)
8227 goto onError;
8228 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008229 Py_XDECREF(exc);
8230 Py_XDECREF(errorHandler);
8231 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008232
Benjamin Peterson29060642009-01-31 22:14:21 +00008233 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008234 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008235 Py_XDECREF(exc);
8236 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008237 return NULL;
8238}
8239
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008240/* Deprecated. Use PyUnicode_Translate instead. */
8241PyObject *
8242PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8243 Py_ssize_t size,
8244 PyObject *mapping,
8245 const char *errors)
8246{
8247 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8248 if (!unicode)
8249 return NULL;
8250 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8251}
8252
Alexander Belopolsky40018472011-02-26 01:02:56 +00008253PyObject *
8254PyUnicode_Translate(PyObject *str,
8255 PyObject *mapping,
8256 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008257{
8258 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008259
Guido van Rossumd57fd912000-03-10 22:53:23 +00008260 str = PyUnicode_FromObject(str);
8261 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008262 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008263 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008264 Py_DECREF(str);
8265 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008266
Benjamin Peterson29060642009-01-31 22:14:21 +00008267 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008268 Py_XDECREF(str);
8269 return NULL;
8270}
Tim Petersced69f82003-09-16 20:30:58 +00008271
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008272static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008273fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008274{
8275 /* No need to call PyUnicode_READY(self) because this function is only
8276 called as a callback from fixup() which does it already. */
8277 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8278 const int kind = PyUnicode_KIND(self);
8279 void *data = PyUnicode_DATA(self);
8280 Py_UCS4 maxchar = 0, ch, fixed;
8281 Py_ssize_t i;
8282
8283 for (i = 0; i < len; ++i) {
8284 ch = PyUnicode_READ(kind, data, i);
8285 fixed = 0;
8286 if (ch > 127) {
8287 if (Py_UNICODE_ISSPACE(ch))
8288 fixed = ' ';
8289 else {
8290 const int decimal = Py_UNICODE_TODECIMAL(ch);
8291 if (decimal >= 0)
8292 fixed = '0' + decimal;
8293 }
8294 if (fixed != 0) {
8295 if (fixed > maxchar)
8296 maxchar = fixed;
8297 PyUnicode_WRITE(kind, data, i, fixed);
8298 }
8299 else if (ch > maxchar)
8300 maxchar = ch;
8301 }
8302 else if (ch > maxchar)
8303 maxchar = ch;
8304 }
8305
8306 return maxchar;
8307}
8308
8309PyObject *
8310_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8311{
8312 if (!PyUnicode_Check(unicode)) {
8313 PyErr_BadInternalCall();
8314 return NULL;
8315 }
8316 if (PyUnicode_READY(unicode) == -1)
8317 return NULL;
8318 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8319 /* If the string is already ASCII, just return the same string */
8320 Py_INCREF(unicode);
8321 return unicode;
8322 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008323 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008324}
8325
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008326PyObject *
8327PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8328 Py_ssize_t length)
8329{
8330 PyObject *result;
8331 Py_UNICODE *p; /* write pointer into result */
8332 Py_ssize_t i;
8333 /* Copy to a new string */
8334 result = (PyObject *)_PyUnicode_New(length);
8335 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8336 if (result == NULL)
8337 return result;
8338 p = PyUnicode_AS_UNICODE(result);
8339 /* Iterate over code points */
8340 for (i = 0; i < length; i++) {
8341 Py_UNICODE ch =s[i];
8342 if (ch > 127) {
8343 int decimal = Py_UNICODE_TODECIMAL(ch);
8344 if (decimal >= 0)
8345 p[i] = '0' + decimal;
8346 }
8347 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008348#ifndef DONT_MAKE_RESULT_READY
8349 if (_PyUnicode_READY_REPLACE(&result)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008350 Py_DECREF(result);
8351 return NULL;
8352 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008353#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02008354 assert(_PyUnicode_CheckConsistency(result, 1));
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008355 return result;
8356}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008357/* --- Decimal Encoder ---------------------------------------------------- */
8358
Alexander Belopolsky40018472011-02-26 01:02:56 +00008359int
8360PyUnicode_EncodeDecimal(Py_UNICODE *s,
8361 Py_ssize_t length,
8362 char *output,
8363 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008364{
8365 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008366 PyObject *errorHandler = NULL;
8367 PyObject *exc = NULL;
8368 const char *encoding = "decimal";
8369 const char *reason = "invalid decimal Unicode string";
8370 /* the following variable is used for caching string comparisons
8371 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8372 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008373
8374 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008375 PyErr_BadArgument();
8376 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008377 }
8378
8379 p = s;
8380 end = s + length;
8381 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008382 register Py_UNICODE ch = *p;
8383 int decimal;
8384 PyObject *repunicode;
8385 Py_ssize_t repsize;
8386 Py_ssize_t newpos;
8387 Py_UNICODE *uni2;
8388 Py_UNICODE *collstart;
8389 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00008390
Benjamin Peterson29060642009-01-31 22:14:21 +00008391 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008392 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008393 ++p;
8394 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008395 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008396 decimal = Py_UNICODE_TODECIMAL(ch);
8397 if (decimal >= 0) {
8398 *output++ = '0' + decimal;
8399 ++p;
8400 continue;
8401 }
8402 if (0 < ch && ch < 256) {
8403 *output++ = (char)ch;
8404 ++p;
8405 continue;
8406 }
8407 /* All other characters are considered unencodable */
8408 collstart = p;
8409 collend = p+1;
8410 while (collend < end) {
8411 if ((0 < *collend && *collend < 256) ||
8412 !Py_UNICODE_ISSPACE(*collend) ||
8413 Py_UNICODE_TODECIMAL(*collend))
8414 break;
8415 }
8416 /* cache callback name lookup
8417 * (if not done yet, i.e. it's the first error) */
8418 if (known_errorHandler==-1) {
8419 if ((errors==NULL) || (!strcmp(errors, "strict")))
8420 known_errorHandler = 1;
8421 else if (!strcmp(errors, "replace"))
8422 known_errorHandler = 2;
8423 else if (!strcmp(errors, "ignore"))
8424 known_errorHandler = 3;
8425 else if (!strcmp(errors, "xmlcharrefreplace"))
8426 known_errorHandler = 4;
8427 else
8428 known_errorHandler = 0;
8429 }
8430 switch (known_errorHandler) {
8431 case 1: /* strict */
8432 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8433 goto onError;
8434 case 2: /* replace */
8435 for (p = collstart; p < collend; ++p)
8436 *output++ = '?';
8437 /* fall through */
8438 case 3: /* ignore */
8439 p = collend;
8440 break;
8441 case 4: /* xmlcharrefreplace */
8442 /* generate replacement (temporarily (mis)uses p) */
8443 for (p = collstart; p < collend; ++p)
8444 output += sprintf(output, "&#%d;", (int)*p);
8445 p = collend;
8446 break;
8447 default:
8448 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8449 encoding, reason, s, length, &exc,
8450 collstart-s, collend-s, &newpos);
8451 if (repunicode == NULL)
8452 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008453 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008454 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008455 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8456 Py_DECREF(repunicode);
8457 goto onError;
8458 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008459 /* generate replacement */
8460 repsize = PyUnicode_GET_SIZE(repunicode);
8461 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8462 Py_UNICODE ch = *uni2;
8463 if (Py_UNICODE_ISSPACE(ch))
8464 *output++ = ' ';
8465 else {
8466 decimal = Py_UNICODE_TODECIMAL(ch);
8467 if (decimal >= 0)
8468 *output++ = '0' + decimal;
8469 else if (0 < ch && ch < 256)
8470 *output++ = (char)ch;
8471 else {
8472 Py_DECREF(repunicode);
8473 raise_encode_exception(&exc, encoding,
8474 s, length, collstart-s, collend-s, reason);
8475 goto onError;
8476 }
8477 }
8478 }
8479 p = s + newpos;
8480 Py_DECREF(repunicode);
8481 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008482 }
8483 /* 0-terminate the output string */
8484 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008485 Py_XDECREF(exc);
8486 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008487 return 0;
8488
Benjamin Peterson29060642009-01-31 22:14:21 +00008489 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008490 Py_XDECREF(exc);
8491 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008492 return -1;
8493}
8494
Guido van Rossumd57fd912000-03-10 22:53:23 +00008495/* --- Helpers ------------------------------------------------------------ */
8496
Victor Stinnerc3cec782011-10-05 21:24:08 +02008497#include "stringlib/asciilib.h"
8498#include "stringlib/fastsearch.h"
8499#include "stringlib/partition.h"
8500#include "stringlib/split.h"
8501#include "stringlib/count.h"
8502#include "stringlib/find.h"
8503#include "stringlib/localeutil.h"
8504#include "stringlib/undef.h"
8505
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008506#include "stringlib/ucs1lib.h"
8507#include "stringlib/fastsearch.h"
8508#include "stringlib/partition.h"
8509#include "stringlib/split.h"
8510#include "stringlib/count.h"
8511#include "stringlib/find.h"
8512#include "stringlib/localeutil.h"
8513#include "stringlib/undef.h"
8514
8515#include "stringlib/ucs2lib.h"
8516#include "stringlib/fastsearch.h"
8517#include "stringlib/partition.h"
8518#include "stringlib/split.h"
8519#include "stringlib/count.h"
8520#include "stringlib/find.h"
8521#include "stringlib/localeutil.h"
8522#include "stringlib/undef.h"
8523
8524#include "stringlib/ucs4lib.h"
8525#include "stringlib/fastsearch.h"
8526#include "stringlib/partition.h"
8527#include "stringlib/split.h"
8528#include "stringlib/count.h"
8529#include "stringlib/find.h"
8530#include "stringlib/localeutil.h"
8531#include "stringlib/undef.h"
8532
8533static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008534any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008535 Py_ssize_t start,
8536 Py_ssize_t end)
8537{
8538 int kind1, kind2, kind;
8539 void *buf1, *buf2;
8540 Py_ssize_t len1, len2, result;
8541
8542 kind1 = PyUnicode_KIND(s1);
8543 kind2 = PyUnicode_KIND(s2);
8544 kind = kind1 > kind2 ? kind1 : kind2;
8545 buf1 = PyUnicode_DATA(s1);
8546 buf2 = PyUnicode_DATA(s2);
8547 if (kind1 != kind)
8548 buf1 = _PyUnicode_AsKind(s1, kind);
8549 if (!buf1)
8550 return -2;
8551 if (kind2 != kind)
8552 buf2 = _PyUnicode_AsKind(s2, kind);
8553 if (!buf2) {
8554 if (kind1 != kind) PyMem_Free(buf1);
8555 return -2;
8556 }
8557 len1 = PyUnicode_GET_LENGTH(s1);
8558 len2 = PyUnicode_GET_LENGTH(s2);
8559
Victor Stinner794d5672011-10-10 03:21:36 +02008560 if (direction > 0) {
8561 switch(kind) {
8562 case PyUnicode_1BYTE_KIND:
8563 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8564 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8565 else
8566 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8567 break;
8568 case PyUnicode_2BYTE_KIND:
8569 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8570 break;
8571 case PyUnicode_4BYTE_KIND:
8572 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8573 break;
8574 default:
8575 assert(0); result = -2;
8576 }
8577 }
8578 else {
8579 switch(kind) {
8580 case PyUnicode_1BYTE_KIND:
8581 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8582 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8583 else
8584 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8585 break;
8586 case PyUnicode_2BYTE_KIND:
8587 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8588 break;
8589 case PyUnicode_4BYTE_KIND:
8590 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8591 break;
8592 default:
8593 assert(0); result = -2;
8594 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008595 }
8596
8597 if (kind1 != kind)
8598 PyMem_Free(buf1);
8599 if (kind2 != kind)
8600 PyMem_Free(buf2);
8601
8602 return result;
8603}
8604
8605Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02008606_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008607 Py_ssize_t n_buffer,
8608 void *digits, Py_ssize_t n_digits,
8609 Py_ssize_t min_width,
8610 const char *grouping,
8611 const char *thousands_sep)
8612{
8613 switch(kind) {
8614 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008615 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
8616 return _PyUnicode_ascii_InsertThousandsGrouping(
8617 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8618 min_width, grouping, thousands_sep);
8619 else
8620 return _PyUnicode_ucs1_InsertThousandsGrouping(
8621 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8622 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008623 case PyUnicode_2BYTE_KIND:
8624 return _PyUnicode_ucs2_InsertThousandsGrouping(
8625 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8626 min_width, grouping, thousands_sep);
8627 case PyUnicode_4BYTE_KIND:
8628 return _PyUnicode_ucs4_InsertThousandsGrouping(
8629 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8630 min_width, grouping, thousands_sep);
8631 }
8632 assert(0);
8633 return -1;
8634}
8635
8636
Eric Smith8c663262007-08-25 02:26:07 +00008637#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00008638#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008639
Thomas Wouters477c8d52006-05-27 19:21:47 +00008640#include "stringlib/count.h"
8641#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00008642
/* helper macro to fixup start/end slice values:
   - end is clipped to len; a negative end is taken relative to len and
     clamped at 0,
   - a negative start is taken relative to len and clamped at 0.
   start and end must be modifiable lvalues.  Every argument is
   evaluated more than once, so none of them may have side effects.
   The expansion is a plain statement sequence (not a single statement);
   do not use it as the unbraced body of an if/else or a loop. */
#define ADJUST_INDICES(start, end, len)         \
    if ((end) > (len))                          \
        (end) = (len);                          \
    else if ((end) < 0) {                       \
        (end) += (len);                         \
        if ((end) < 0)                          \
            (end) = 0;                          \
    }                                           \
    if ((start) < 0) {                          \
        (start) += (len);                       \
        if ((start) < 0)                        \
            (start) = 0;                        \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008657
Alexander Belopolsky40018472011-02-26 01:02:56 +00008658Py_ssize_t
8659PyUnicode_Count(PyObject *str,
8660 PyObject *substr,
8661 Py_ssize_t start,
8662 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008663{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008664 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008665 PyUnicodeObject* str_obj;
8666 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008667 int kind1, kind2, kind;
8668 void *buf1 = NULL, *buf2 = NULL;
8669 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008670
Thomas Wouters477c8d52006-05-27 19:21:47 +00008671 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008672 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008673 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008674 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008675 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008676 Py_DECREF(str_obj);
8677 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008678 }
Tim Petersced69f82003-09-16 20:30:58 +00008679
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008680 kind1 = PyUnicode_KIND(str_obj);
8681 kind2 = PyUnicode_KIND(sub_obj);
8682 kind = kind1 > kind2 ? kind1 : kind2;
8683 buf1 = PyUnicode_DATA(str_obj);
8684 if (kind1 != kind)
8685 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8686 if (!buf1)
8687 goto onError;
8688 buf2 = PyUnicode_DATA(sub_obj);
8689 if (kind2 != kind)
8690 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8691 if (!buf2)
8692 goto onError;
8693 len1 = PyUnicode_GET_LENGTH(str_obj);
8694 len2 = PyUnicode_GET_LENGTH(sub_obj);
8695
8696 ADJUST_INDICES(start, end, len1);
8697 switch(kind) {
8698 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008699 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8700 result = asciilib_count(
8701 ((Py_UCS1*)buf1) + start, end - start,
8702 buf2, len2, PY_SSIZE_T_MAX
8703 );
8704 else
8705 result = ucs1lib_count(
8706 ((Py_UCS1*)buf1) + start, end - start,
8707 buf2, len2, PY_SSIZE_T_MAX
8708 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008709 break;
8710 case PyUnicode_2BYTE_KIND:
8711 result = ucs2lib_count(
8712 ((Py_UCS2*)buf1) + start, end - start,
8713 buf2, len2, PY_SSIZE_T_MAX
8714 );
8715 break;
8716 case PyUnicode_4BYTE_KIND:
8717 result = ucs4lib_count(
8718 ((Py_UCS4*)buf1) + start, end - start,
8719 buf2, len2, PY_SSIZE_T_MAX
8720 );
8721 break;
8722 default:
8723 assert(0); result = 0;
8724 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008725
8726 Py_DECREF(sub_obj);
8727 Py_DECREF(str_obj);
8728
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008729 if (kind1 != kind)
8730 PyMem_Free(buf1);
8731 if (kind2 != kind)
8732 PyMem_Free(buf2);
8733
Guido van Rossumd57fd912000-03-10 22:53:23 +00008734 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008735 onError:
8736 Py_DECREF(sub_obj);
8737 Py_DECREF(str_obj);
8738 if (kind1 != kind && buf1)
8739 PyMem_Free(buf1);
8740 if (kind2 != kind && buf2)
8741 PyMem_Free(buf2);
8742 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008743}
8744
Alexander Belopolsky40018472011-02-26 01:02:56 +00008745Py_ssize_t
8746PyUnicode_Find(PyObject *str,
8747 PyObject *sub,
8748 Py_ssize_t start,
8749 Py_ssize_t end,
8750 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008751{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008752 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008753
Guido van Rossumd57fd912000-03-10 22:53:23 +00008754 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008755 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008756 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008757 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008758 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008759 Py_DECREF(str);
8760 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008761 }
Tim Petersced69f82003-09-16 20:30:58 +00008762
Victor Stinner794d5672011-10-10 03:21:36 +02008763 result = any_find_slice(direction,
8764 str, sub, start, end
8765 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00008766
Guido van Rossumd57fd912000-03-10 22:53:23 +00008767 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008768 Py_DECREF(sub);
8769
Guido van Rossumd57fd912000-03-10 22:53:23 +00008770 return result;
8771}
8772
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008773Py_ssize_t
8774PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8775 Py_ssize_t start, Py_ssize_t end,
8776 int direction)
8777{
8778 char *result;
8779 int kind;
8780 if (PyUnicode_READY(str) == -1)
8781 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008782 if (start < 0 || end < 0) {
8783 PyErr_SetString(PyExc_IndexError, "string index out of range");
8784 return -2;
8785 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008786 if (end > PyUnicode_GET_LENGTH(str))
8787 end = PyUnicode_GET_LENGTH(str);
8788 kind = PyUnicode_KIND(str);
8789 result = findchar(PyUnicode_1BYTE_DATA(str)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008790 + kind*start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008791 kind,
8792 end-start, ch, direction);
8793 if (!result)
8794 return -1;
8795 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8796}
8797
Alexander Belopolsky40018472011-02-26 01:02:56 +00008798static int
8799tailmatch(PyUnicodeObject *self,
8800 PyUnicodeObject *substring,
8801 Py_ssize_t start,
8802 Py_ssize_t end,
8803 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008804{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008805 int kind_self;
8806 int kind_sub;
8807 void *data_self;
8808 void *data_sub;
8809 Py_ssize_t offset;
8810 Py_ssize_t i;
8811 Py_ssize_t end_sub;
8812
8813 if (PyUnicode_READY(self) == -1 ||
8814 PyUnicode_READY(substring) == -1)
8815 return 0;
8816
8817 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008818 return 1;
8819
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008820 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8821 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008822 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008823 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008824
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008825 kind_self = PyUnicode_KIND(self);
8826 data_self = PyUnicode_DATA(self);
8827 kind_sub = PyUnicode_KIND(substring);
8828 data_sub = PyUnicode_DATA(substring);
8829 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8830
8831 if (direction > 0)
8832 offset = end;
8833 else
8834 offset = start;
8835
8836 if (PyUnicode_READ(kind_self, data_self, offset) ==
8837 PyUnicode_READ(kind_sub, data_sub, 0) &&
8838 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8839 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8840 /* If both are of the same kind, memcmp is sufficient */
8841 if (kind_self == kind_sub) {
8842 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008843 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008844 data_sub,
8845 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008846 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008847 }
8848 /* otherwise we have to compare each character by first accesing it */
8849 else {
8850 /* We do not need to compare 0 and len(substring)-1 because
8851 the if statement above ensured already that they are equal
8852 when we end up here. */
8853 // TODO: honor direction and do a forward or backwards search
8854 for (i = 1; i < end_sub; ++i) {
8855 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8856 PyUnicode_READ(kind_sub, data_sub, i))
8857 return 0;
8858 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008859 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008860 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008861 }
8862
8863 return 0;
8864}
8865
Alexander Belopolsky40018472011-02-26 01:02:56 +00008866Py_ssize_t
8867PyUnicode_Tailmatch(PyObject *str,
8868 PyObject *substr,
8869 Py_ssize_t start,
8870 Py_ssize_t end,
8871 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008872{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008873 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008874
Guido van Rossumd57fd912000-03-10 22:53:23 +00008875 str = PyUnicode_FromObject(str);
8876 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008877 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008878 substr = PyUnicode_FromObject(substr);
8879 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008880 Py_DECREF(str);
8881 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008882 }
Tim Petersced69f82003-09-16 20:30:58 +00008883
Guido van Rossumd57fd912000-03-10 22:53:23 +00008884 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008885 (PyUnicodeObject *)substr,
8886 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008887 Py_DECREF(str);
8888 Py_DECREF(substr);
8889 return result;
8890}
8891
Guido van Rossumd57fd912000-03-10 22:53:23 +00008892/* Apply fixfct filter to the Unicode object self and return a
8893 reference to the modified object */
8894
Alexander Belopolsky40018472011-02-26 01:02:56 +00008895static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02008896fixup(PyObject *self,
8897 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008898{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008899 PyObject *u;
8900 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008901
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008902 if (PyUnicode_READY(self) == -1)
8903 return NULL;
8904 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8905 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8906 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008907 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008908 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008909
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008910 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008911 PyUnicode_GET_LENGTH(u) * PyUnicode_KIND(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008912
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008913 /* fix functions return the new maximum character in a string,
8914 if the kind of the resulting unicode object does not change,
8915 everything is fine. Otherwise we need to change the string kind
8916 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02008917 maxchar_new = fixfct(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008918 if (maxchar_new == 0)
8919 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8920 else if (maxchar_new <= 127)
8921 maxchar_new = 127;
8922 else if (maxchar_new <= 255)
8923 maxchar_new = 255;
8924 else if (maxchar_new <= 65535)
8925 maxchar_new = 65535;
8926 else
8927 maxchar_new = 1114111; /* 0x10ffff */
8928
8929 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008930 /* fixfct should return TRUE if it modified the buffer. If
8931 FALSE, return a reference to the original buffer instead
8932 (to save space, not time) */
8933 Py_INCREF(self);
8934 Py_DECREF(u);
8935 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008936 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008937 else if (maxchar_new == maxchar_old) {
8938 return u;
8939 }
8940 else {
8941 /* In case the maximum character changed, we need to
8942 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008943 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008944 if (v == NULL) {
8945 Py_DECREF(u);
8946 return NULL;
8947 }
8948 if (maxchar_new > maxchar_old) {
8949 /* If the maxchar increased so that the kind changed, not all
8950 characters are representable anymore and we need to fix the
8951 string again. This only happens in very few cases. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02008952 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner9310abb2011-10-05 00:59:23 +02008953 maxchar_old = fixfct(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008954 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8955 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008956 else {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02008957 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008958 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008959
8960 Py_DECREF(u);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02008961 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008962 return v;
8963 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008964}
8965
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008966static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008967fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008968{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008969 /* No need to call PyUnicode_READY(self) because this function is only
8970 called as a callback from fixup() which does it already. */
8971 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8972 const int kind = PyUnicode_KIND(self);
8973 void *data = PyUnicode_DATA(self);
8974 int touched = 0;
8975 Py_UCS4 maxchar = 0;
8976 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008977
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008978 for (i = 0; i < len; ++i) {
8979 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8980 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8981 if (up != ch) {
8982 if (up > maxchar)
8983 maxchar = up;
8984 PyUnicode_WRITE(kind, data, i, up);
8985 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008986 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008987 else if (ch > maxchar)
8988 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008989 }
8990
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008991 if (touched)
8992 return maxchar;
8993 else
8994 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008995}
8996
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008997static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008998fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008999{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009000 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9001 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9002 const int kind = PyUnicode_KIND(self);
9003 void *data = PyUnicode_DATA(self);
9004 int touched = 0;
9005 Py_UCS4 maxchar = 0;
9006 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009007
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009008 for(i = 0; i < len; ++i) {
9009 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9010 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9011 if (lo != ch) {
9012 if (lo > maxchar)
9013 maxchar = lo;
9014 PyUnicode_WRITE(kind, data, i, lo);
9015 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009016 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009017 else if (ch > maxchar)
9018 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009019 }
9020
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009021 if (touched)
9022 return maxchar;
9023 else
9024 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009025}
9026
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009027static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009028fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009029{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009030 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9031 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9032 const int kind = PyUnicode_KIND(self);
9033 void *data = PyUnicode_DATA(self);
9034 int touched = 0;
9035 Py_UCS4 maxchar = 0;
9036 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009037
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009038 for(i = 0; i < len; ++i) {
9039 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9040 Py_UCS4 nu = 0;
9041
9042 if (Py_UNICODE_ISUPPER(ch))
9043 nu = Py_UNICODE_TOLOWER(ch);
9044 else if (Py_UNICODE_ISLOWER(ch))
9045 nu = Py_UNICODE_TOUPPER(ch);
9046
9047 if (nu != 0) {
9048 if (nu > maxchar)
9049 maxchar = nu;
9050 PyUnicode_WRITE(kind, data, i, nu);
9051 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009052 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009053 else if (ch > maxchar)
9054 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009055 }
9056
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009057 if (touched)
9058 return maxchar;
9059 else
9060 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009061}
9062
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009063static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009064fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009065{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009066 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9067 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9068 const int kind = PyUnicode_KIND(self);
9069 void *data = PyUnicode_DATA(self);
9070 int touched = 0;
9071 Py_UCS4 maxchar = 0;
9072 Py_ssize_t i = 0;
9073 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00009074
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009075 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009076 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009077
9078 ch = PyUnicode_READ(kind, data, i);
9079 if (!Py_UNICODE_ISUPPER(ch)) {
9080 maxchar = Py_UNICODE_TOUPPER(ch);
9081 PyUnicode_WRITE(kind, data, i, maxchar);
9082 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009083 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009084 ++i;
9085 for(; i < len; ++i) {
9086 ch = PyUnicode_READ(kind, data, i);
9087 if (!Py_UNICODE_ISLOWER(ch)) {
9088 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9089 if (lo > maxchar)
9090 maxchar = lo;
9091 PyUnicode_WRITE(kind, data, i, lo);
9092 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009093 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009094 else if (ch > maxchar)
9095 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009096 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009097
9098 if (touched)
9099 return maxchar;
9100 else
9101 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009102}
9103
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009104static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009105fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009106{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009107 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9108 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9109 const int kind = PyUnicode_KIND(self);
9110 void *data = PyUnicode_DATA(self);
9111 Py_UCS4 maxchar = 0;
9112 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009113 int previous_is_cased;
9114
9115 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009116 if (len == 1) {
9117 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9118 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9119 if (ti != ch) {
9120 PyUnicode_WRITE(kind, data, i, ti);
9121 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00009122 }
9123 else
9124 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009125 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009126 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009127 for(; i < len; ++i) {
9128 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9129 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00009130
Benjamin Peterson29060642009-01-31 22:14:21 +00009131 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009132 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00009133 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009134 nu = Py_UNICODE_TOTITLE(ch);
9135
9136 if (nu > maxchar)
9137 maxchar = nu;
9138 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00009139
Benjamin Peterson29060642009-01-31 22:14:21 +00009140 if (Py_UNICODE_ISLOWER(ch) ||
9141 Py_UNICODE_ISUPPER(ch) ||
9142 Py_UNICODE_ISTITLE(ch))
9143 previous_is_cased = 1;
9144 else
9145 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009146 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009147 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009148}
9149
Tim Peters8ce9f162004-08-27 01:49:32 +00009150PyObject *
9151PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009152{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009153 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009154 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009155 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009156 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009157 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9158 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009159 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009160 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009161 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009162 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009163 int use_memcpy;
9164 unsigned char *res_data = NULL, *sep_data = NULL;
9165 PyObject *last_obj;
9166 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009167
Tim Peters05eba1f2004-08-27 21:32:02 +00009168 fseq = PySequence_Fast(seq, "");
9169 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009170 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009171 }
9172
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009173 /* NOTE: the following code can't call back into Python code,
9174 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009175 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009176
Tim Peters05eba1f2004-08-27 21:32:02 +00009177 seqlen = PySequence_Fast_GET_SIZE(fseq);
9178 /* If empty sequence, return u"". */
9179 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009180 Py_DECREF(fseq);
9181 Py_INCREF(unicode_empty);
9182 res = unicode_empty;
9183 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009184 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009185
Tim Peters05eba1f2004-08-27 21:32:02 +00009186 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009187 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009188 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009189 if (seqlen == 1) {
9190 if (PyUnicode_CheckExact(items[0])) {
9191 res = items[0];
9192 Py_INCREF(res);
9193 Py_DECREF(fseq);
9194 return res;
9195 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009196 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009197 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009198 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009199 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009200 /* Set up sep and seplen */
9201 if (separator == NULL) {
9202 /* fall back to a blank space separator */
9203 sep = PyUnicode_FromOrdinal(' ');
9204 if (!sep)
9205 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009206 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009207 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009208 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009209 else {
9210 if (!PyUnicode_Check(separator)) {
9211 PyErr_Format(PyExc_TypeError,
9212 "separator: expected str instance,"
9213 " %.80s found",
9214 Py_TYPE(separator)->tp_name);
9215 goto onError;
9216 }
9217 if (PyUnicode_READY(separator))
9218 goto onError;
9219 sep = separator;
9220 seplen = PyUnicode_GET_LENGTH(separator);
9221 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9222 /* inc refcount to keep this code path symmetric with the
9223 above case of a blank separator */
9224 Py_INCREF(sep);
9225 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009226 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009227 }
9228
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009229 /* There are at least two things to join, or else we have a subclass
9230 * of str in the sequence.
9231 * Do a pre-pass to figure out the total amount of space we'll
9232 * need (sz), and see whether all argument are strings.
9233 */
9234 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009235#ifdef Py_DEBUG
9236 use_memcpy = 0;
9237#else
9238 use_memcpy = 1;
9239#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009240 for (i = 0; i < seqlen; i++) {
9241 const Py_ssize_t old_sz = sz;
9242 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009243 if (!PyUnicode_Check(item)) {
9244 PyErr_Format(PyExc_TypeError,
9245 "sequence item %zd: expected str instance,"
9246 " %.80s found",
9247 i, Py_TYPE(item)->tp_name);
9248 goto onError;
9249 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009250 if (PyUnicode_READY(item) == -1)
9251 goto onError;
9252 sz += PyUnicode_GET_LENGTH(item);
9253 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009254 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009255 if (i != 0)
9256 sz += seplen;
9257 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9258 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009259 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009260 goto onError;
9261 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009262 if (use_memcpy && last_obj != NULL) {
9263 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9264 use_memcpy = 0;
9265 }
9266 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009267 }
Tim Petersced69f82003-09-16 20:30:58 +00009268
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009269 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009270 if (res == NULL)
9271 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009272
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009273 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009274#ifdef Py_DEBUG
9275 use_memcpy = 0;
9276#else
9277 if (use_memcpy) {
9278 res_data = PyUnicode_1BYTE_DATA(res);
9279 kind = PyUnicode_KIND(res);
9280 if (seplen != 0)
9281 sep_data = PyUnicode_1BYTE_DATA(sep);
9282 }
9283#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009284 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009285 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009286 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009287 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009288 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009289 if (use_memcpy) {
9290 Py_MEMCPY(res_data,
9291 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009292 kind * seplen);
9293 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009294 }
9295 else {
9296 copy_characters(res, res_offset, sep, 0, seplen);
9297 res_offset += seplen;
9298 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009299 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009300 itemlen = PyUnicode_GET_LENGTH(item);
9301 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009302 if (use_memcpy) {
9303 Py_MEMCPY(res_data,
9304 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009305 kind * itemlen);
9306 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009307 }
9308 else {
9309 copy_characters(res, res_offset, item, 0, itemlen);
9310 res_offset += itemlen;
9311 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009312 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009313 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009314 if (use_memcpy)
9315 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009316 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009317 else
9318 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009319
Tim Peters05eba1f2004-08-27 21:32:02 +00009320 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009321 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009322 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009323 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009324
Benjamin Peterson29060642009-01-31 22:14:21 +00009325 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009326 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009327 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009328 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009329 return NULL;
9330}
9331
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009332#define FILL(kind, data, value, start, length) \
9333 do { \
9334 Py_ssize_t i_ = 0; \
9335 assert(kind != PyUnicode_WCHAR_KIND); \
9336 switch ((kind)) { \
9337 case PyUnicode_1BYTE_KIND: { \
9338 unsigned char * to_ = (unsigned char *)((data)) + (start); \
9339 memset(to_, (unsigned char)value, length); \
9340 break; \
9341 } \
9342 case PyUnicode_2BYTE_KIND: { \
9343 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
9344 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9345 break; \
9346 } \
9347 default: { \
9348 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
9349 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9350 break; \
9351 } \
9352 } \
9353 } while (0)
9354
Victor Stinner9310abb2011-10-05 00:59:23 +02009355static PyObject *
9356pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009357 Py_ssize_t left,
9358 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009359 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009360{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009361 PyObject *u;
9362 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009363 int kind;
9364 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009365
9366 if (left < 0)
9367 left = 0;
9368 if (right < 0)
9369 right = 0;
9370
Tim Peters7a29bd52001-09-12 03:03:31 +00009371 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009372 Py_INCREF(self);
9373 return self;
9374 }
9375
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009376 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9377 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009378 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9379 return NULL;
9380 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009381 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9382 if (fill > maxchar)
9383 maxchar = fill;
9384 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009385 if (!u)
9386 return NULL;
9387
9388 kind = PyUnicode_KIND(u);
9389 data = PyUnicode_DATA(u);
9390 if (left)
9391 FILL(kind, data, fill, 0, left);
9392 if (right)
9393 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009394 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009395 assert(_PyUnicode_CheckConsistency(u, 1));
9396 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009397}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009398#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009399
Alexander Belopolsky40018472011-02-26 01:02:56 +00009400PyObject *
9401PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009402{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009403 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009404
9405 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009406 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009407 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009408
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009409 switch(PyUnicode_KIND(string)) {
9410 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009411 if (PyUnicode_IS_ASCII(string))
9412 list = asciilib_splitlines(
9413 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9414 PyUnicode_GET_LENGTH(string), keepends);
9415 else
9416 list = ucs1lib_splitlines(
9417 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9418 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009419 break;
9420 case PyUnicode_2BYTE_KIND:
9421 list = ucs2lib_splitlines(
9422 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
9423 PyUnicode_GET_LENGTH(string), keepends);
9424 break;
9425 case PyUnicode_4BYTE_KIND:
9426 list = ucs4lib_splitlines(
9427 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
9428 PyUnicode_GET_LENGTH(string), keepends);
9429 break;
9430 default:
9431 assert(0);
9432 list = 0;
9433 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009434 Py_DECREF(string);
9435 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009436}
9437
Alexander Belopolsky40018472011-02-26 01:02:56 +00009438static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009439split(PyObject *self,
9440 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009441 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009442{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009443 int kind1, kind2, kind;
9444 void *buf1, *buf2;
9445 Py_ssize_t len1, len2;
9446 PyObject* out;
9447
Guido van Rossumd57fd912000-03-10 22:53:23 +00009448 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009449 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009450
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009451 if (PyUnicode_READY(self) == -1)
9452 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009453
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009454 if (substring == NULL)
9455 switch(PyUnicode_KIND(self)) {
9456 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009457 if (PyUnicode_IS_ASCII(self))
9458 return asciilib_split_whitespace(
9459 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9460 PyUnicode_GET_LENGTH(self), maxcount
9461 );
9462 else
9463 return ucs1lib_split_whitespace(
9464 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9465 PyUnicode_GET_LENGTH(self), maxcount
9466 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009467 case PyUnicode_2BYTE_KIND:
9468 return ucs2lib_split_whitespace(
9469 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9470 PyUnicode_GET_LENGTH(self), maxcount
9471 );
9472 case PyUnicode_4BYTE_KIND:
9473 return ucs4lib_split_whitespace(
9474 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9475 PyUnicode_GET_LENGTH(self), maxcount
9476 );
9477 default:
9478 assert(0);
9479 return NULL;
9480 }
9481
9482 if (PyUnicode_READY(substring) == -1)
9483 return NULL;
9484
9485 kind1 = PyUnicode_KIND(self);
9486 kind2 = PyUnicode_KIND(substring);
9487 kind = kind1 > kind2 ? kind1 : kind2;
9488 buf1 = PyUnicode_DATA(self);
9489 buf2 = PyUnicode_DATA(substring);
9490 if (kind1 != kind)
9491 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9492 if (!buf1)
9493 return NULL;
9494 if (kind2 != kind)
9495 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9496 if (!buf2) {
9497 if (kind1 != kind) PyMem_Free(buf1);
9498 return NULL;
9499 }
9500 len1 = PyUnicode_GET_LENGTH(self);
9501 len2 = PyUnicode_GET_LENGTH(substring);
9502
9503 switch(kind) {
9504 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009505 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9506 out = asciilib_split(
9507 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9508 else
9509 out = ucs1lib_split(
9510 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009511 break;
9512 case PyUnicode_2BYTE_KIND:
9513 out = ucs2lib_split(
9514 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9515 break;
9516 case PyUnicode_4BYTE_KIND:
9517 out = ucs4lib_split(
9518 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9519 break;
9520 default:
9521 out = NULL;
9522 }
9523 if (kind1 != kind)
9524 PyMem_Free(buf1);
9525 if (kind2 != kind)
9526 PyMem_Free(buf2);
9527 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009528}
9529
Alexander Belopolsky40018472011-02-26 01:02:56 +00009530static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009531rsplit(PyObject *self,
9532 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009533 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009534{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009535 int kind1, kind2, kind;
9536 void *buf1, *buf2;
9537 Py_ssize_t len1, len2;
9538 PyObject* out;
9539
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009540 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009541 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009542
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009543 if (PyUnicode_READY(self) == -1)
9544 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009545
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009546 if (substring == NULL)
9547 switch(PyUnicode_KIND(self)) {
9548 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009549 if (PyUnicode_IS_ASCII(self))
9550 return asciilib_rsplit_whitespace(
9551 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9552 PyUnicode_GET_LENGTH(self), maxcount
9553 );
9554 else
9555 return ucs1lib_rsplit_whitespace(
9556 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9557 PyUnicode_GET_LENGTH(self), maxcount
9558 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009559 case PyUnicode_2BYTE_KIND:
9560 return ucs2lib_rsplit_whitespace(
9561 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9562 PyUnicode_GET_LENGTH(self), maxcount
9563 );
9564 case PyUnicode_4BYTE_KIND:
9565 return ucs4lib_rsplit_whitespace(
9566 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9567 PyUnicode_GET_LENGTH(self), maxcount
9568 );
9569 default:
9570 assert(0);
9571 return NULL;
9572 }
9573
9574 if (PyUnicode_READY(substring) == -1)
9575 return NULL;
9576
9577 kind1 = PyUnicode_KIND(self);
9578 kind2 = PyUnicode_KIND(substring);
9579 kind = kind1 > kind2 ? kind1 : kind2;
9580 buf1 = PyUnicode_DATA(self);
9581 buf2 = PyUnicode_DATA(substring);
9582 if (kind1 != kind)
9583 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9584 if (!buf1)
9585 return NULL;
9586 if (kind2 != kind)
9587 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9588 if (!buf2) {
9589 if (kind1 != kind) PyMem_Free(buf1);
9590 return NULL;
9591 }
9592 len1 = PyUnicode_GET_LENGTH(self);
9593 len2 = PyUnicode_GET_LENGTH(substring);
9594
9595 switch(kind) {
9596 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009597 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9598 out = asciilib_rsplit(
9599 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9600 else
9601 out = ucs1lib_rsplit(
9602 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009603 break;
9604 case PyUnicode_2BYTE_KIND:
9605 out = ucs2lib_rsplit(
9606 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9607 break;
9608 case PyUnicode_4BYTE_KIND:
9609 out = ucs4lib_rsplit(
9610 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9611 break;
9612 default:
9613 out = NULL;
9614 }
9615 if (kind1 != kind)
9616 PyMem_Free(buf1);
9617 if (kind2 != kind)
9618 PyMem_Free(buf2);
9619 return out;
9620}
9621
9622static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009623anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9624 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009625{
9626 switch(kind) {
9627 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009628 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9629 return asciilib_find(buf1, len1, buf2, len2, offset);
9630 else
9631 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009632 case PyUnicode_2BYTE_KIND:
9633 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9634 case PyUnicode_4BYTE_KIND:
9635 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9636 }
9637 assert(0);
9638 return -1;
9639}
9640
9641static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009642anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9643 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009644{
9645 switch(kind) {
9646 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009647 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9648 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9649 else
9650 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009651 case PyUnicode_2BYTE_KIND:
9652 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9653 case PyUnicode_4BYTE_KIND:
9654 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9655 }
9656 assert(0);
9657 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009658}
9659
Alexander Belopolsky40018472011-02-26 01:02:56 +00009660static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009661replace(PyObject *self, PyObject *str1,
9662 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009663{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009664 PyObject *u;
9665 char *sbuf = PyUnicode_DATA(self);
9666 char *buf1 = PyUnicode_DATA(str1);
9667 char *buf2 = PyUnicode_DATA(str2);
9668 int srelease = 0, release1 = 0, release2 = 0;
9669 int skind = PyUnicode_KIND(self);
9670 int kind1 = PyUnicode_KIND(str1);
9671 int kind2 = PyUnicode_KIND(str2);
9672 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9673 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9674 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009675
9676 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009677 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009678 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009679 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009680
Victor Stinner59de0ee2011-10-07 10:01:28 +02009681 if (str1 == str2)
9682 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009683 if (skind < kind1)
9684 /* substring too wide to be present */
9685 goto nothing;
9686
9687 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00009688 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009689 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009690 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009691 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009692 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009693 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009694 Py_UCS4 u1, u2, maxchar;
9695 int mayshrink, rkind;
9696 u1 = PyUnicode_READ_CHAR(str1, 0);
9697 if (!findchar(sbuf, PyUnicode_KIND(self),
9698 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00009699 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009700 u2 = PyUnicode_READ_CHAR(str2, 0);
9701 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9702 /* Replacing u1 with u2 may cause a maxchar reduction in the
9703 result string. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009704 if (u2 > maxchar) {
9705 maxchar = u2;
9706 mayshrink = 0;
9707 }
Victor Stinnerb9275c12011-10-05 14:01:42 +02009708 else
9709 mayshrink = maxchar > 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009710 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009711 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009712 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009713 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009714 rkind = PyUnicode_KIND(u);
9715 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9716 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009717 if (--maxcount < 0)
9718 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009719 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009720 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009721 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +02009722 unicode_adjust_maxchar(&u);
9723 if (u == NULL)
9724 goto error;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009725 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009726 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009727 int rkind = skind;
9728 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +02009729 PyObject *rstr;
9730 Py_UCS4 maxchar;
9731
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009732 if (kind1 < rkind) {
9733 /* widen substring */
9734 buf1 = _PyUnicode_AsKind(str1, rkind);
9735 if (!buf1) goto error;
9736 release1 = 1;
9737 }
Victor Stinnerc3cec782011-10-05 21:24:08 +02009738 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009739 if (i < 0)
9740 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009741 if (rkind > kind2) {
9742 /* widen replacement */
9743 buf2 = _PyUnicode_AsKind(str2, rkind);
9744 if (!buf2) goto error;
9745 release2 = 1;
9746 }
9747 else if (rkind < kind2) {
9748 /* widen self and buf1 */
9749 rkind = kind2;
9750 if (release1) PyMem_Free(buf1);
9751 sbuf = _PyUnicode_AsKind(self, rkind);
9752 if (!sbuf) goto error;
9753 srelease = 1;
9754 buf1 = _PyUnicode_AsKind(str1, rkind);
9755 if (!buf1) goto error;
9756 release1 = 1;
9757 }
Victor Stinner25a4b292011-10-06 12:31:55 +02009758 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9759 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(str2));
9760 rstr = PyUnicode_New(slen, maxchar);
9761 if (!rstr)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009762 goto error;
Victor Stinner25a4b292011-10-06 12:31:55 +02009763 res = PyUnicode_DATA(rstr);
9764
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009765 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009766 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009767 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009768 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009769 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009770 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009771
9772 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +02009773 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009774 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009775 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009776 if (i == -1)
9777 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009778 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009779 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009780 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009781 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009782 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009783
Victor Stinner25a4b292011-10-06 12:31:55 +02009784 u = rstr;
9785 unicode_adjust_maxchar(&u);
9786 if (!u)
9787 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009788 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009789 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009790
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009791 Py_ssize_t n, i, j, ires;
9792 Py_ssize_t product, new_size;
9793 int rkind = skind;
Victor Stinner25a4b292011-10-06 12:31:55 +02009794 PyObject *rstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009795 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +02009796 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009797
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009798 if (kind1 < rkind) {
9799 buf1 = _PyUnicode_AsKind(str1, rkind);
9800 if (!buf1) goto error;
9801 release1 = 1;
9802 }
Victor Stinnerc3cec782011-10-05 21:24:08 +02009803 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009804 if (n == 0)
9805 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009806 if (kind2 < rkind) {
9807 buf2 = _PyUnicode_AsKind(str2, rkind);
9808 if (!buf2) goto error;
9809 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009810 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009811 else if (kind2 > rkind) {
9812 rkind = kind2;
9813 sbuf = _PyUnicode_AsKind(self, rkind);
9814 if (!sbuf) goto error;
9815 srelease = 1;
9816 if (release1) PyMem_Free(buf1);
9817 buf1 = _PyUnicode_AsKind(str1, rkind);
9818 if (!buf1) goto error;
9819 release1 = 1;
9820 }
9821 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9822 PyUnicode_GET_LENGTH(str1))); */
9823 product = n * (len2-len1);
9824 if ((product / (len2-len1)) != n) {
9825 PyErr_SetString(PyExc_OverflowError,
9826 "replace string is too long");
9827 goto error;
9828 }
9829 new_size = slen + product;
9830 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9831 PyErr_SetString(PyExc_OverflowError,
9832 "replace string is too long");
9833 goto error;
9834 }
Victor Stinner25a4b292011-10-06 12:31:55 +02009835 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9836 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(str2));
9837 rstr = PyUnicode_New(new_size, maxchar);
9838 if (!rstr)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009839 goto error;
Victor Stinner25a4b292011-10-06 12:31:55 +02009840 res = PyUnicode_DATA(rstr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009841 ires = i = 0;
9842 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009843 while (n-- > 0) {
9844 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +02009845 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009846 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009847 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009848 if (j == -1)
9849 break;
9850 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009851 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009852 memcpy(res + rkind * ires,
9853 sbuf + rkind * i,
9854 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009855 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009856 }
9857 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009858 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009859 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009860 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009861 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009862 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009863 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009864 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009865 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009866 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009867 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009868 memcpy(res + rkind * ires,
9869 sbuf + rkind * i,
9870 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009871 } else {
9872 /* interleave */
9873 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009874 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009875 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009876 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009877 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009878 if (--n <= 0)
9879 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009880 memcpy(res + rkind * ires,
9881 sbuf + rkind * i,
9882 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009883 ires++;
9884 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009885 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009886 memcpy(res + rkind * ires,
9887 sbuf + rkind * i,
9888 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009889 }
Victor Stinner25a4b292011-10-06 12:31:55 +02009890 u = rstr;
9891 unicode_adjust_maxchar(&u);
9892 if (u == NULL)
9893 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009894 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009895 if (srelease)
9896 PyMem_FREE(sbuf);
9897 if (release1)
9898 PyMem_FREE(buf1);
9899 if (release2)
9900 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009901 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009902 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009903
Benjamin Peterson29060642009-01-31 22:14:21 +00009904 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009905 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009906 if (srelease)
9907 PyMem_FREE(sbuf);
9908 if (release1)
9909 PyMem_FREE(buf1);
9910 if (release2)
9911 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009912 if (PyUnicode_CheckExact(self)) {
9913 Py_INCREF(self);
9914 return (PyObject *) self;
9915 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009916 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009917 error:
9918 if (srelease && sbuf)
9919 PyMem_FREE(sbuf);
9920 if (release1 && buf1)
9921 PyMem_FREE(buf1);
9922 if (release2 && buf2)
9923 PyMem_FREE(buf2);
9924 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009925}
9926
9927/* --- Unicode Object Methods --------------------------------------------- */
9928
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009929PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009930 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009931\n\
9932Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009933characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009934
9935static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +02009936unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009937{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009938 return fixup(self, fixtitle);
9939}
9940
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009941PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009942 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009943\n\
9944Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009945have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009946
9947static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +02009948unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009949{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009950 return fixup(self, fixcapitalize);
9951}
9952
#if 0
/* Disabled code: kept for reference only, not compiled. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split `self` on whitespace, capitalize every word in place in the
   resulting list, then join the list again.  Returns NULL if the split,
   any capitalization or the join fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx = 0;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new string into the list slot */
    while (idx < PyList_GET_SIZE(words)) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                            fixcapitalize);
        if (capitalized == NULL)
            goto cleanup;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
        idx++;
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  cleanup:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
9990
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009991/* Argument converter. Coerces to a single unicode character */
9992
9993static int
9994convert_uc(PyObject *obj, void *addr)
9995{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009996 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009997 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009998
Benjamin Peterson14339b62009-01-31 16:36:08 +00009999 uniobj = PyUnicode_FromObject(obj);
10000 if (uniobj == NULL) {
10001 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010002 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010003 return 0;
10004 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010005 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010006 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010007 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010008 Py_DECREF(uniobj);
10009 return 0;
10010 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010011 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010012 Py_DECREF(uniobj);
10013 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010014}
10015
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010016PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010017 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010018\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010019Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010020done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010021
10022static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010023unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010024{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010025 Py_ssize_t marg, left;
10026 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010027 Py_UCS4 fillchar = ' ';
10028
Victor Stinnere9a29352011-10-01 02:14:59 +020010029 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010030 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010031
Victor Stinnere9a29352011-10-01 02:14:59 +020010032 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010033 return NULL;
10034
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010035 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010036 Py_INCREF(self);
10037 return (PyObject*) self;
10038 }
10039
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010040 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010041 left = marg / 2 + (marg & width & 1);
10042
Victor Stinner9310abb2011-10-05 00:59:23 +020010043 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010044}
10045
Marc-André Lemburge5034372000-08-08 08:04:29 +000010046#if 0
10047
10048/* This code should go into some future Unicode collation support
10049 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +000010050 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +000010051
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +000010052/* speedy UTF-16 code point order comparison */
10053/* gleaned from: */
10054/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
10055
Marc-André Lemburge12896e2000-07-07 17:51:08 +000010056static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +000010057{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +000010058 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +000010059 0, 0, 0, 0, 0, 0, 0, 0,
10060 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +000010061 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +000010062};
10063
Guido van Rossumd57fd912000-03-10 22:53:23 +000010064static int
10065unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
10066{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010067 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +000010068
Guido van Rossumd57fd912000-03-10 22:53:23 +000010069 Py_UNICODE *s1 = str1->str;
10070 Py_UNICODE *s2 = str2->str;
10071
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010072 len1 = str1->_base._base.length;
10073 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +000010074
Guido van Rossumd57fd912000-03-10 22:53:23 +000010075 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +000010076 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +000010077
10078 c1 = *s1++;
10079 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +000010080
Benjamin Peterson29060642009-01-31 22:14:21 +000010081 if (c1 > (1<<11) * 26)
10082 c1 += utf16Fixup[c1>>11];
10083 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +000010084 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +000010085 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +000010086
10087 if (c1 != c2)
10088 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +000010089
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +000010090 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010091 }
10092
10093 return (len1 < len2) ? -1 : (len1 != len2);
10094}
10095
Marc-André Lemburge5034372000-08-08 08:04:29 +000010096#else
10097
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010098/* This function assumes that str1 and str2 are readied by the caller. */
10099
Marc-André Lemburge5034372000-08-08 08:04:29 +000010100static int
10101unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
10102{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010103 int kind1, kind2;
10104 void *data1, *data2;
10105 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010106
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010107 kind1 = PyUnicode_KIND(str1);
10108 kind2 = PyUnicode_KIND(str2);
10109 data1 = PyUnicode_DATA(str1);
10110 data2 = PyUnicode_DATA(str2);
10111 len1 = PyUnicode_GET_LENGTH(str1);
10112 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010113
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010114 for (i = 0; i < len1 && i < len2; ++i) {
10115 Py_UCS4 c1, c2;
10116 c1 = PyUnicode_READ(kind1, data1, i);
10117 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010118
10119 if (c1 != c2)
10120 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010121 }
10122
10123 return (len1 < len2) ? -1 : (len1 != len2);
10124}
10125
10126#endif
10127
Alexander Belopolsky40018472011-02-26 01:02:56 +000010128int
10129PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010130{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010131 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10132 if (PyUnicode_READY(left) == -1 ||
10133 PyUnicode_READY(right) == -1)
10134 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010135 return unicode_compare((PyUnicodeObject *)left,
10136 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010137 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010138 PyErr_Format(PyExc_TypeError,
10139 "Can't compare %.100s and %.100s",
10140 left->ob_type->tp_name,
10141 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010142 return -1;
10143}
10144
Martin v. Löwis5b222132007-06-10 09:51:05 +000010145int
10146PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10147{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010148 Py_ssize_t i;
10149 int kind;
10150 void *data;
10151 Py_UCS4 chr;
10152
Victor Stinner910337b2011-10-03 03:20:16 +020010153 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010154 if (PyUnicode_READY(uni) == -1)
10155 return -1;
10156 kind = PyUnicode_KIND(uni);
10157 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010158 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010159 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10160 if (chr != str[i])
10161 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010162 /* This check keeps Python strings that end in '\0' from comparing equal
10163 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010164 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010165 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010166 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010167 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010168 return 0;
10169}
10170
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010171
Benjamin Peterson29060642009-01-31 22:14:21 +000010172#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010173 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010174
Alexander Belopolsky40018472011-02-26 01:02:56 +000010175PyObject *
10176PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010177{
10178 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010179
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010180 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10181 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010182 if (PyUnicode_READY(left) == -1 ||
10183 PyUnicode_READY(right) == -1)
10184 return NULL;
10185 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10186 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010187 if (op == Py_EQ) {
10188 Py_INCREF(Py_False);
10189 return Py_False;
10190 }
10191 if (op == Py_NE) {
10192 Py_INCREF(Py_True);
10193 return Py_True;
10194 }
10195 }
10196 if (left == right)
10197 result = 0;
10198 else
10199 result = unicode_compare((PyUnicodeObject *)left,
10200 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010201
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010202 /* Convert the return value to a Boolean */
10203 switch (op) {
10204 case Py_EQ:
10205 v = TEST_COND(result == 0);
10206 break;
10207 case Py_NE:
10208 v = TEST_COND(result != 0);
10209 break;
10210 case Py_LE:
10211 v = TEST_COND(result <= 0);
10212 break;
10213 case Py_GE:
10214 v = TEST_COND(result >= 0);
10215 break;
10216 case Py_LT:
10217 v = TEST_COND(result == -1);
10218 break;
10219 case Py_GT:
10220 v = TEST_COND(result == 1);
10221 break;
10222 default:
10223 PyErr_BadArgument();
10224 return NULL;
10225 }
10226 Py_INCREF(v);
10227 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010228 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010229
Brian Curtindfc80e32011-08-10 20:28:54 -050010230 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010231}
10232
Alexander Belopolsky40018472011-02-26 01:02:56 +000010233int
10234PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010235{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010236 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010237 int kind1, kind2, kind;
10238 void *buf1, *buf2;
10239 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010240 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010241
10242 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010243 sub = PyUnicode_FromObject(element);
10244 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010245 PyErr_Format(PyExc_TypeError,
10246 "'in <string>' requires string as left operand, not %s",
10247 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010248 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010249 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010250 if (PyUnicode_READY(sub) == -1)
10251 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010252
Thomas Wouters477c8d52006-05-27 19:21:47 +000010253 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010254 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010255 Py_DECREF(sub);
10256 return -1;
10257 }
10258
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010259 kind1 = PyUnicode_KIND(str);
10260 kind2 = PyUnicode_KIND(sub);
10261 kind = kind1 > kind2 ? kind1 : kind2;
10262 buf1 = PyUnicode_DATA(str);
10263 buf2 = PyUnicode_DATA(sub);
10264 if (kind1 != kind)
10265 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
10266 if (!buf1) {
10267 Py_DECREF(sub);
10268 return -1;
10269 }
10270 if (kind2 != kind)
10271 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
10272 if (!buf2) {
10273 Py_DECREF(sub);
10274 if (kind1 != kind) PyMem_Free(buf1);
10275 return -1;
10276 }
10277 len1 = PyUnicode_GET_LENGTH(str);
10278 len2 = PyUnicode_GET_LENGTH(sub);
10279
10280 switch(kind) {
10281 case PyUnicode_1BYTE_KIND:
10282 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10283 break;
10284 case PyUnicode_2BYTE_KIND:
10285 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10286 break;
10287 case PyUnicode_4BYTE_KIND:
10288 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10289 break;
10290 default:
10291 result = -1;
10292 assert(0);
10293 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010294
10295 Py_DECREF(str);
10296 Py_DECREF(sub);
10297
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010298 if (kind1 != kind)
10299 PyMem_Free(buf1);
10300 if (kind2 != kind)
10301 PyMem_Free(buf2);
10302
Guido van Rossum403d68b2000-03-13 15:55:09 +000010303 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010304}
10305
Guido van Rossumd57fd912000-03-10 22:53:23 +000010306/* Concat to string or Unicode object giving a new Unicode object. */
10307
Alexander Belopolsky40018472011-02-26 01:02:56 +000010308PyObject *
10309PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010310{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010311 PyObject *u = NULL, *v = NULL, *w;
10312 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010313
10314 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010315 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010316 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010317 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010318 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010319 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010320 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010321
10322 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010323 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010324 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010325 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010326 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010327 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010328 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010329 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010330 }
10331
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010332 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +020010333 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010334
Guido van Rossumd57fd912000-03-10 22:53:23 +000010335 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010336 w = PyUnicode_New(
10337 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10338 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010339 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010340 goto onError;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010341 copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
10342 copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010343 Py_DECREF(u);
10344 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010345 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010346 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010347
Benjamin Peterson29060642009-01-31 22:14:21 +000010348 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010349 Py_XDECREF(u);
10350 Py_XDECREF(v);
10351 return NULL;
10352}
10353
Victor Stinnerb0923652011-10-04 01:17:31 +020010354static void
10355unicode_append_inplace(PyObject **p_left, PyObject *right)
10356{
10357 Py_ssize_t left_len, right_len, new_len;
Victor Stinnerb0923652011-10-04 01:17:31 +020010358
10359 assert(PyUnicode_IS_READY(*p_left));
10360 assert(PyUnicode_IS_READY(right));
10361
10362 left_len = PyUnicode_GET_LENGTH(*p_left);
10363 right_len = PyUnicode_GET_LENGTH(right);
10364 if (left_len > PY_SSIZE_T_MAX - right_len) {
10365 PyErr_SetString(PyExc_OverflowError,
10366 "strings are too large to concat");
10367 goto error;
10368 }
10369 new_len = left_len + right_len;
10370
10371 /* Now we own the last reference to 'left', so we can resize it
10372 * in-place.
10373 */
10374 if (unicode_resize(p_left, new_len) != 0) {
10375 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10376 * deallocated so it cannot be put back into
10377 * 'variable'. The MemoryError is raised when there
10378 * is no value in 'variable', which might (very
10379 * remotely) be a cause of incompatibilities.
10380 */
10381 goto error;
10382 }
10383 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010384 copy_characters(*p_left, left_len, right, 0, right_len);
10385 _PyUnicode_DIRTY(*p_left);
Victor Stinnerb0923652011-10-04 01:17:31 +020010386 return;
10387
10388error:
10389 Py_DECREF(*p_left);
10390 *p_left = NULL;
10391}
10392
Walter Dörwald1ab83302007-05-18 17:15:44 +000010393void
Victor Stinner23e56682011-10-03 03:54:37 +020010394PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010395{
Victor Stinner23e56682011-10-03 03:54:37 +020010396 PyObject *left, *res;
10397
10398 if (p_left == NULL) {
10399 if (!PyErr_Occurred())
10400 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010401 return;
10402 }
Victor Stinner23e56682011-10-03 03:54:37 +020010403 left = *p_left;
10404 if (right == NULL || !PyUnicode_Check(left)) {
10405 if (!PyErr_Occurred())
10406 PyErr_BadInternalCall();
10407 goto error;
10408 }
10409
Victor Stinnere1335c72011-10-04 20:53:03 +020010410 if (PyUnicode_READY(left))
10411 goto error;
10412 if (PyUnicode_READY(right))
10413 goto error;
10414
Victor Stinner23e56682011-10-03 03:54:37 +020010415 if (PyUnicode_CheckExact(left) && left != unicode_empty
10416 && PyUnicode_CheckExact(right) && right != unicode_empty
10417 && unicode_resizable(left)
10418 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10419 || _PyUnicode_WSTR(left) != NULL))
10420 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010421 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10422 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010423 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010424 not so different than duplicating the string. */
10425 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010426 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010427 unicode_append_inplace(p_left, right);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010428 if (p_left != NULL)
10429 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010430 return;
10431 }
10432 }
10433
10434 res = PyUnicode_Concat(left, right);
10435 if (res == NULL)
10436 goto error;
10437 Py_DECREF(left);
10438 *p_left = res;
10439 return;
10440
10441error:
10442 Py_DECREF(*p_left);
10443 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010444}
10445
10446void
10447PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10448{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010449 PyUnicode_Append(pleft, right);
10450 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010451}
10452
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010453PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010454 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010455\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010456Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010457string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010458interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010459
10460static PyObject *
10461unicode_count(PyUnicodeObject *self, PyObject *args)
10462{
10463 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010464 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010465 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010466 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010467 int kind1, kind2, kind;
10468 void *buf1, *buf2;
10469 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010470
Jesus Ceaac451502011-04-20 17:09:23 +020010471 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10472 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010473 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010474
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010475 kind1 = PyUnicode_KIND(self);
10476 kind2 = PyUnicode_KIND(substring);
10477 kind = kind1 > kind2 ? kind1 : kind2;
10478 buf1 = PyUnicode_DATA(self);
10479 buf2 = PyUnicode_DATA(substring);
10480 if (kind1 != kind)
10481 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10482 if (!buf1) {
10483 Py_DECREF(substring);
10484 return NULL;
10485 }
10486 if (kind2 != kind)
10487 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10488 if (!buf2) {
10489 Py_DECREF(substring);
10490 if (kind1 != kind) PyMem_Free(buf1);
10491 return NULL;
10492 }
10493 len1 = PyUnicode_GET_LENGTH(self);
10494 len2 = PyUnicode_GET_LENGTH(substring);
10495
10496 ADJUST_INDICES(start, end, len1);
10497 switch(kind) {
10498 case PyUnicode_1BYTE_KIND:
10499 iresult = ucs1lib_count(
10500 ((Py_UCS1*)buf1) + start, end - start,
10501 buf2, len2, PY_SSIZE_T_MAX
10502 );
10503 break;
10504 case PyUnicode_2BYTE_KIND:
10505 iresult = ucs2lib_count(
10506 ((Py_UCS2*)buf1) + start, end - start,
10507 buf2, len2, PY_SSIZE_T_MAX
10508 );
10509 break;
10510 case PyUnicode_4BYTE_KIND:
10511 iresult = ucs4lib_count(
10512 ((Py_UCS4*)buf1) + start, end - start,
10513 buf2, len2, PY_SSIZE_T_MAX
10514 );
10515 break;
10516 default:
10517 assert(0); iresult = 0;
10518 }
10519
10520 result = PyLong_FromSsize_t(iresult);
10521
10522 if (kind1 != kind)
10523 PyMem_Free(buf1);
10524 if (kind2 != kind)
10525 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010526
10527 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010528
Guido van Rossumd57fd912000-03-10 22:53:23 +000010529 return result;
10530}
10531
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010532PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010533 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010534\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010535Encode S using the codec registered for encoding. Default encoding\n\
10536is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010537handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010538a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10539'xmlcharrefreplace' as well as any other name registered with\n\
10540codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010541
10542static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010543unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010544{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010545 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010546 char *encoding = NULL;
10547 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010548
Benjamin Peterson308d6372009-09-18 21:42:35 +000010549 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10550 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010551 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010552 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010553}
10554
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010555PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010556 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010557\n\
10558Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010559If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010560
10561static PyObject*
10562unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10563{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010564 Py_ssize_t i, j, line_pos, src_len, incr;
10565 Py_UCS4 ch;
10566 PyObject *u;
10567 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010568 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010569 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010570 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010571
10572 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010573 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010574
Antoine Pitrou22425222011-10-04 19:10:51 +020010575 if (PyUnicode_READY(self) == -1)
10576 return NULL;
10577
Thomas Wouters7e474022000-07-16 12:04:32 +000010578 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010579 src_len = PyUnicode_GET_LENGTH(self);
10580 i = j = line_pos = 0;
10581 kind = PyUnicode_KIND(self);
10582 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010583 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010584 for (; i < src_len; i++) {
10585 ch = PyUnicode_READ(kind, src_data, i);
10586 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010587 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010588 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010589 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010590 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010591 goto overflow;
10592 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010593 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010594 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010595 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010596 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010597 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010598 goto overflow;
10599 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010600 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010601 if (ch == '\n' || ch == '\r')
10602 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010603 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010604 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020010605 if (!found && PyUnicode_CheckExact(self)) {
10606 Py_INCREF((PyObject *) self);
10607 return (PyObject *) self;
10608 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010609
Guido van Rossumd57fd912000-03-10 22:53:23 +000010610 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010611 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010612 if (!u)
10613 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010614 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010615
Antoine Pitroue71d5742011-10-04 15:55:09 +020010616 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010617
Antoine Pitroue71d5742011-10-04 15:55:09 +020010618 for (; i < src_len; i++) {
10619 ch = PyUnicode_READ(kind, src_data, i);
10620 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010621 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010622 incr = tabsize - (line_pos % tabsize);
10623 line_pos += incr;
10624 while (incr--) {
10625 PyUnicode_WRITE(kind, dest_data, j, ' ');
10626 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010627 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010628 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010629 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010630 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010631 line_pos++;
10632 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010633 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010634 if (ch == '\n' || ch == '\r')
10635 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010636 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010637 }
10638 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinner17efeed2011-10-04 20:05:46 +020010639#ifndef DONT_MAKE_RESULT_READY
10640 if (_PyUnicode_READY_REPLACE(&u)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010641 Py_DECREF(u);
10642 return NULL;
10643 }
Victor Stinner17efeed2011-10-04 20:05:46 +020010644#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010645 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010646 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010647
Antoine Pitroue71d5742011-10-04 15:55:09 +020010648 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010649 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10650 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010651}
10652
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010653PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010654 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010655\n\
10656Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010657such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010658arguments start and end are interpreted as in slice notation.\n\
10659\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010660Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010661
10662static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010663unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010664{
Jesus Ceaac451502011-04-20 17:09:23 +020010665 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010666 Py_ssize_t start;
10667 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010668 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010669
Jesus Ceaac451502011-04-20 17:09:23 +020010670 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10671 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010672 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010673
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010674 if (PyUnicode_READY(self) == -1)
10675 return NULL;
10676 if (PyUnicode_READY(substring) == -1)
10677 return NULL;
10678
Victor Stinner794d5672011-10-10 03:21:36 +020010679 result = any_find_slice(1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010680 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010681 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010682
10683 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010684
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010685 if (result == -2)
10686 return NULL;
10687
Christian Heimes217cfd12007-12-02 14:31:20 +000010688 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010689}
10690
10691static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010692unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010693{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010694 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10695 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010696 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010697 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010698}
10699
Guido van Rossumc2504932007-09-18 19:42:40 +000010700/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010701 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010702static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000010703unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010704{
Guido van Rossumc2504932007-09-18 19:42:40 +000010705 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010706 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010707
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010708 if (_PyUnicode_HASH(self) != -1)
10709 return _PyUnicode_HASH(self);
10710 if (PyUnicode_READY(self) == -1)
10711 return -1;
10712 len = PyUnicode_GET_LENGTH(self);
10713
10714 /* The hash function as a macro, gets expanded three times below. */
10715#define HASH(P) \
10716 x = (Py_uhash_t)*P << 7; \
10717 while (--len >= 0) \
10718 x = (1000003*x) ^ (Py_uhash_t)*P++;
10719
10720 switch (PyUnicode_KIND(self)) {
10721 case PyUnicode_1BYTE_KIND: {
10722 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10723 HASH(c);
10724 break;
10725 }
10726 case PyUnicode_2BYTE_KIND: {
10727 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10728 HASH(s);
10729 break;
10730 }
10731 default: {
10732 Py_UCS4 *l;
10733 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10734 "Impossible switch case in unicode_hash");
10735 l = PyUnicode_4BYTE_DATA(self);
10736 HASH(l);
10737 break;
10738 }
10739 }
10740 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10741
Guido van Rossumc2504932007-09-18 19:42:40 +000010742 if (x == -1)
10743 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010744 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010745 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010746}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010747#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010748
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010749PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010750 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010751\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010752Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010753
10754static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010755unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010756{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010757 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020010758 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010759 Py_ssize_t start;
10760 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010761
Jesus Ceaac451502011-04-20 17:09:23 +020010762 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10763 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010764 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010765
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010766 if (PyUnicode_READY(self) == -1)
10767 return NULL;
10768 if (PyUnicode_READY(substring) == -1)
10769 return NULL;
10770
Victor Stinner794d5672011-10-10 03:21:36 +020010771 result = any_find_slice(1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010772 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010773 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010774
10775 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010776
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010777 if (result == -2)
10778 return NULL;
10779
Guido van Rossumd57fd912000-03-10 22:53:23 +000010780 if (result < 0) {
10781 PyErr_SetString(PyExc_ValueError, "substring not found");
10782 return NULL;
10783 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010784
Christian Heimes217cfd12007-12-02 14:31:20 +000010785 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010786}
10787
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010788PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010789 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010790\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010791Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010792at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010793
10794static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010795unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010796{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010797 Py_ssize_t i, length;
10798 int kind;
10799 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010800 int cased;
10801
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010802 if (PyUnicode_READY(self) == -1)
10803 return NULL;
10804 length = PyUnicode_GET_LENGTH(self);
10805 kind = PyUnicode_KIND(self);
10806 data = PyUnicode_DATA(self);
10807
Guido van Rossumd57fd912000-03-10 22:53:23 +000010808 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010809 if (length == 1)
10810 return PyBool_FromLong(
10811 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010812
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010813 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010814 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010815 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010816
Guido van Rossumd57fd912000-03-10 22:53:23 +000010817 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010818 for (i = 0; i < length; i++) {
10819 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010820
Benjamin Peterson29060642009-01-31 22:14:21 +000010821 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10822 return PyBool_FromLong(0);
10823 else if (!cased && Py_UNICODE_ISLOWER(ch))
10824 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010825 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010826 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010827}
10828
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010829PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010830 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010831\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010832Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010833at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010834
10835static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010836unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010837{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010838 Py_ssize_t i, length;
10839 int kind;
10840 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010841 int cased;
10842
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010843 if (PyUnicode_READY(self) == -1)
10844 return NULL;
10845 length = PyUnicode_GET_LENGTH(self);
10846 kind = PyUnicode_KIND(self);
10847 data = PyUnicode_DATA(self);
10848
Guido van Rossumd57fd912000-03-10 22:53:23 +000010849 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010850 if (length == 1)
10851 return PyBool_FromLong(
10852 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010853
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010854 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010855 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010856 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010857
Guido van Rossumd57fd912000-03-10 22:53:23 +000010858 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010859 for (i = 0; i < length; i++) {
10860 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010861
Benjamin Peterson29060642009-01-31 22:14:21 +000010862 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10863 return PyBool_FromLong(0);
10864 else if (!cased && Py_UNICODE_ISUPPER(ch))
10865 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010866 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010867 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010868}
10869
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010870PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010871 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010872\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010873Return True if S is a titlecased string and there is at least one\n\
10874character in S, i.e. upper- and titlecase characters may only\n\
10875follow uncased characters and lowercase characters only cased ones.\n\
10876Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010877
10878static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010879unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010880{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010881 Py_ssize_t i, length;
10882 int kind;
10883 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010884 int cased, previous_is_cased;
10885
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010886 if (PyUnicode_READY(self) == -1)
10887 return NULL;
10888 length = PyUnicode_GET_LENGTH(self);
10889 kind = PyUnicode_KIND(self);
10890 data = PyUnicode_DATA(self);
10891
Guido van Rossumd57fd912000-03-10 22:53:23 +000010892 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010893 if (length == 1) {
10894 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10895 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10896 (Py_UNICODE_ISUPPER(ch) != 0));
10897 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010898
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010899 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010900 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010901 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010902
Guido van Rossumd57fd912000-03-10 22:53:23 +000010903 cased = 0;
10904 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010905 for (i = 0; i < length; i++) {
10906 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010907
Benjamin Peterson29060642009-01-31 22:14:21 +000010908 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10909 if (previous_is_cased)
10910 return PyBool_FromLong(0);
10911 previous_is_cased = 1;
10912 cased = 1;
10913 }
10914 else if (Py_UNICODE_ISLOWER(ch)) {
10915 if (!previous_is_cased)
10916 return PyBool_FromLong(0);
10917 previous_is_cased = 1;
10918 cased = 1;
10919 }
10920 else
10921 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010922 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010923 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010924}
10925
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010926PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010927 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010928\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010929Return True if all characters in S are whitespace\n\
10930and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010931
10932static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010933unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010934{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010935 Py_ssize_t i, length;
10936 int kind;
10937 void *data;
10938
10939 if (PyUnicode_READY(self) == -1)
10940 return NULL;
10941 length = PyUnicode_GET_LENGTH(self);
10942 kind = PyUnicode_KIND(self);
10943 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010944
Guido van Rossumd57fd912000-03-10 22:53:23 +000010945 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010946 if (length == 1)
10947 return PyBool_FromLong(
10948 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010949
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010950 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010951 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010952 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010953
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010954 for (i = 0; i < length; i++) {
10955 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010956 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010957 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010958 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010959 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010960}
10961
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010962PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010963 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010964\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010965Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010966and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010967
10968static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010969unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010970{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010971 Py_ssize_t i, length;
10972 int kind;
10973 void *data;
10974
10975 if (PyUnicode_READY(self) == -1)
10976 return NULL;
10977 length = PyUnicode_GET_LENGTH(self);
10978 kind = PyUnicode_KIND(self);
10979 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010980
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010981 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010982 if (length == 1)
10983 return PyBool_FromLong(
10984 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010985
10986 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010987 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010988 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010989
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010990 for (i = 0; i < length; i++) {
10991 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010992 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010993 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010994 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010995}
10996
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010997PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010998 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010999\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011000Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011001and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011002
11003static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011004unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011005{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011006 int kind;
11007 void *data;
11008 Py_ssize_t len, i;
11009
11010 if (PyUnicode_READY(self) == -1)
11011 return NULL;
11012
11013 kind = PyUnicode_KIND(self);
11014 data = PyUnicode_DATA(self);
11015 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011016
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011017 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011018 if (len == 1) {
11019 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11020 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11021 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011022
11023 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011024 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011025 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011026
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011027 for (i = 0; i < len; i++) {
11028 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011029 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011030 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011031 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011032 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011033}
11034
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011035PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011036 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011037\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011038Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011039False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011040
11041static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011042unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011043{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011044 Py_ssize_t i, length;
11045 int kind;
11046 void *data;
11047
11048 if (PyUnicode_READY(self) == -1)
11049 return NULL;
11050 length = PyUnicode_GET_LENGTH(self);
11051 kind = PyUnicode_KIND(self);
11052 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011053
Guido van Rossumd57fd912000-03-10 22:53:23 +000011054 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011055 if (length == 1)
11056 return PyBool_FromLong(
11057 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011058
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011059 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011060 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011061 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011062
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011063 for (i = 0; i < length; i++) {
11064 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011065 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011066 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011067 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011068}
11069
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011070PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011071 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011072\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011073Return True if all characters in S are digits\n\
11074and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011075
11076static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011077unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011078{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011079 Py_ssize_t i, length;
11080 int kind;
11081 void *data;
11082
11083 if (PyUnicode_READY(self) == -1)
11084 return NULL;
11085 length = PyUnicode_GET_LENGTH(self);
11086 kind = PyUnicode_KIND(self);
11087 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011088
Guido van Rossumd57fd912000-03-10 22:53:23 +000011089 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011090 if (length == 1) {
11091 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11092 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11093 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011094
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011095 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011096 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011097 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011098
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011099 for (i = 0; i < length; i++) {
11100 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011101 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011102 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011103 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011104}
11105
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011106PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011107 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011108\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011109Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011110False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011111
11112static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011113unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011114{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011115 Py_ssize_t i, length;
11116 int kind;
11117 void *data;
11118
11119 if (PyUnicode_READY(self) == -1)
11120 return NULL;
11121 length = PyUnicode_GET_LENGTH(self);
11122 kind = PyUnicode_KIND(self);
11123 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011124
Guido van Rossumd57fd912000-03-10 22:53:23 +000011125 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011126 if (length == 1)
11127 return PyBool_FromLong(
11128 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011129
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011130 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011131 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011132 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011133
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011134 for (i = 0; i < length; i++) {
11135 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011136 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011137 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011138 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011139}
11140
Martin v. Löwis47383402007-08-15 07:32:56 +000011141int
11142PyUnicode_IsIdentifier(PyObject *self)
11143{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011144 int kind;
11145 void *data;
11146 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011147 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011148
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011149 if (PyUnicode_READY(self) == -1) {
11150 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011151 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011152 }
11153
11154 /* Special case for empty strings */
11155 if (PyUnicode_GET_LENGTH(self) == 0)
11156 return 0;
11157 kind = PyUnicode_KIND(self);
11158 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011159
11160 /* PEP 3131 says that the first character must be in
11161 XID_Start and subsequent characters in XID_Continue,
11162 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011163 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011164 letters, digits, underscore). However, given the current
11165 definition of XID_Start and XID_Continue, it is sufficient
11166 to check just for these, except that _ must be allowed
11167 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011168 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011169 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011170 return 0;
11171
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011172 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011173 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011174 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011175 return 1;
11176}
11177
11178PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011179 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011180\n\
11181Return True if S is a valid identifier according\n\
11182to the language definition.");
11183
11184static PyObject*
11185unicode_isidentifier(PyObject *self)
11186{
11187 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11188}
11189
Georg Brandl559e5d72008-06-11 18:37:52 +000011190PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011191 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011192\n\
11193Return True if all characters in S are considered\n\
11194printable in repr() or S is empty, False otherwise.");
11195
11196static PyObject*
11197unicode_isprintable(PyObject *self)
11198{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011199 Py_ssize_t i, length;
11200 int kind;
11201 void *data;
11202
11203 if (PyUnicode_READY(self) == -1)
11204 return NULL;
11205 length = PyUnicode_GET_LENGTH(self);
11206 kind = PyUnicode_KIND(self);
11207 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011208
11209 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011210 if (length == 1)
11211 return PyBool_FromLong(
11212 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011213
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011214 for (i = 0; i < length; i++) {
11215 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011216 Py_RETURN_FALSE;
11217 }
11218 }
11219 Py_RETURN_TRUE;
11220}
11221
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011222PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011223 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011224\n\
11225Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011226iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011227
11228static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011229unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011230{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011231 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011232}
11233
Martin v. Löwis18e16552006-02-15 17:27:45 +000011234static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000011235unicode_length(PyUnicodeObject *self)
11236{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011237 if (PyUnicode_READY(self) == -1)
11238 return -1;
11239 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011240}
11241
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011242PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011243 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011244\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011245Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011246done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011247
11248static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011249unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011250{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011251 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011252 Py_UCS4 fillchar = ' ';
11253
11254 if (PyUnicode_READY(self) == -1)
11255 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011256
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011257 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011258 return NULL;
11259
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011260 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011261 Py_INCREF(self);
11262 return (PyObject*) self;
11263 }
11264
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011265 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011266}
11267
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011268PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011269 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011270\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011271Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011272
11273static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011274unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011275{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011276 return fixup(self, fixlower);
11277}
11278
/* Strip types: which end(s) of the string get trimmed. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by the strip types above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip type i: the format string without its
   three-character "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
11287
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011288/* externally visible for str.strip(unicode) */
11289PyObject *
11290_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
11291{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011292 void *data;
11293 int kind;
11294 Py_ssize_t i, j, len;
11295 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011296
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011297 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11298 return NULL;
11299
11300 kind = PyUnicode_KIND(self);
11301 data = PyUnicode_DATA(self);
11302 len = PyUnicode_GET_LENGTH(self);
11303 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11304 PyUnicode_DATA(sepobj),
11305 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011306
Benjamin Peterson14339b62009-01-31 16:36:08 +000011307 i = 0;
11308 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011309 while (i < len &&
11310 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011311 i++;
11312 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011313 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011314
Benjamin Peterson14339b62009-01-31 16:36:08 +000011315 j = len;
11316 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011317 do {
11318 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011319 } while (j >= i &&
11320 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011321 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011322 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011323
Victor Stinner12bab6d2011-10-01 01:53:49 +020011324 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011325}
11326
11327PyObject*
11328PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11329{
11330 unsigned char *data;
11331 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011332 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011333
Victor Stinnerde636f32011-10-01 03:55:54 +020011334 if (PyUnicode_READY(self) == -1)
11335 return NULL;
11336
11337 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11338
Victor Stinner12bab6d2011-10-01 01:53:49 +020011339 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011340 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011341 if (PyUnicode_CheckExact(self)) {
11342 Py_INCREF(self);
11343 return self;
11344 }
11345 else
11346 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011347 }
11348
Victor Stinner12bab6d2011-10-01 01:53:49 +020011349 length = end - start;
11350 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011351 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011352
Victor Stinnerde636f32011-10-01 03:55:54 +020011353 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011354 PyErr_SetString(PyExc_IndexError, "string index out of range");
11355 return NULL;
11356 }
11357
Victor Stinnerb9275c12011-10-05 14:01:42 +020011358 if (PyUnicode_IS_ASCII(self)) {
11359 kind = PyUnicode_KIND(self);
11360 data = PyUnicode_1BYTE_DATA(self);
11361 return unicode_fromascii(data + start, length);
11362 }
11363 else {
11364 kind = PyUnicode_KIND(self);
11365 data = PyUnicode_1BYTE_DATA(self);
11366 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011367 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011368 length);
11369 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011370}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011371
11372static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011373do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011374{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011375 int kind;
11376 void *data;
11377 Py_ssize_t len, i, j;
11378
11379 if (PyUnicode_READY(self) == -1)
11380 return NULL;
11381
11382 kind = PyUnicode_KIND(self);
11383 data = PyUnicode_DATA(self);
11384 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011385
Benjamin Peterson14339b62009-01-31 16:36:08 +000011386 i = 0;
11387 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011388 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011389 i++;
11390 }
11391 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011392
Benjamin Peterson14339b62009-01-31 16:36:08 +000011393 j = len;
11394 if (striptype != LEFTSTRIP) {
11395 do {
11396 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011397 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011398 j++;
11399 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011400
Victor Stinner12bab6d2011-10-01 01:53:49 +020011401 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011402}
11403
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011404
11405static PyObject *
11406do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
11407{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011408 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011409
Benjamin Peterson14339b62009-01-31 16:36:08 +000011410 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11411 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011412
Benjamin Peterson14339b62009-01-31 16:36:08 +000011413 if (sep != NULL && sep != Py_None) {
11414 if (PyUnicode_Check(sep))
11415 return _PyUnicode_XStrip(self, striptype, sep);
11416 else {
11417 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011418 "%s arg must be None or str",
11419 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011420 return NULL;
11421 }
11422 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011423
Benjamin Peterson14339b62009-01-31 16:36:08 +000011424 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011425}
11426
11427
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011428PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011429 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011430\n\
11431Return a copy of the string S with leading and trailing\n\
11432whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011433If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011434
11435static PyObject *
11436unicode_strip(PyUnicodeObject *self, PyObject *args)
11437{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011438 if (PyTuple_GET_SIZE(args) == 0)
11439 return do_strip(self, BOTHSTRIP); /* Common case */
11440 else
11441 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011442}
11443
11444
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011445PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011446 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011447\n\
11448Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011449If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011450
11451static PyObject *
11452unicode_lstrip(PyUnicodeObject *self, PyObject *args)
11453{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011454 if (PyTuple_GET_SIZE(args) == 0)
11455 return do_strip(self, LEFTSTRIP); /* Common case */
11456 else
11457 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011458}
11459
11460
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011461PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011462 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011463\n\
11464Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011465If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011466
11467static PyObject *
11468unicode_rstrip(PyUnicodeObject *self, PyObject *args)
11469{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011470 if (PyTuple_GET_SIZE(args) == 0)
11471 return do_strip(self, RIGHTSTRIP); /* Common case */
11472 else
11473 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011474}
11475
11476
Guido van Rossumd57fd912000-03-10 22:53:23 +000011477static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000011478unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011479{
11480 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011481 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011482
Georg Brandl222de0f2009-04-12 12:01:50 +000011483 if (len < 1) {
11484 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011485 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011486 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011487
Tim Peters7a29bd52001-09-12 03:03:31 +000011488 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011489 /* no repeat, return original string */
11490 Py_INCREF(str);
11491 return (PyObject*) str;
11492 }
Tim Peters8f422462000-09-09 06:13:41 +000011493
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011494 if (PyUnicode_READY(str) == -1)
11495 return NULL;
11496
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011497 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011498 PyErr_SetString(PyExc_OverflowError,
11499 "repeated string is too long");
11500 return NULL;
11501 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011502 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011503
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011504 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011505 if (!u)
11506 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011507 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011508
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011509 if (PyUnicode_GET_LENGTH(str) == 1) {
11510 const int kind = PyUnicode_KIND(str);
11511 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11512 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011513 if (kind == PyUnicode_1BYTE_KIND)
11514 memset(to, (unsigned char)fill_char, len);
11515 else {
11516 for (n = 0; n < len; ++n)
11517 PyUnicode_WRITE(kind, to, n, fill_char);
11518 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011519 }
11520 else {
11521 /* number of characters copied this far */
11522 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011523 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011524 char *to = (char *) PyUnicode_DATA(u);
11525 Py_MEMCPY(to, PyUnicode_DATA(str),
11526 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011527 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011528 n = (done <= nchars-done) ? done : nchars-done;
11529 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011530 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011531 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011532 }
11533
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011534 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011535 return (PyObject*) u;
11536}
11537
Alexander Belopolsky40018472011-02-26 01:02:56 +000011538PyObject *
11539PyUnicode_Replace(PyObject *obj,
11540 PyObject *subobj,
11541 PyObject *replobj,
11542 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011543{
11544 PyObject *self;
11545 PyObject *str1;
11546 PyObject *str2;
11547 PyObject *result;
11548
11549 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011550 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011551 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011552 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011553 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011554 Py_DECREF(self);
11555 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011556 }
11557 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011558 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011559 Py_DECREF(self);
11560 Py_DECREF(str1);
11561 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011562 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011563 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011564 Py_DECREF(self);
11565 Py_DECREF(str1);
11566 Py_DECREF(str2);
11567 return result;
11568}
11569
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011570PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011571 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011572\n\
11573Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011574old replaced by new. If the optional argument count is\n\
11575given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011576
11577static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011578unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011579{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011580 PyObject *str1;
11581 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011582 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011583 PyObject *result;
11584
Martin v. Löwis18e16552006-02-15 17:27:45 +000011585 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011586 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011587 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011588 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011589 str1 = PyUnicode_FromObject(str1);
11590 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11591 return NULL;
11592 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011593 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011594 Py_DECREF(str1);
11595 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011596 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011597
11598 result = replace(self, str1, str2, maxcount);
11599
11600 Py_DECREF(str1);
11601 Py_DECREF(str2);
11602 return result;
11603}
11604
Alexander Belopolsky40018472011-02-26 01:02:56 +000011605static PyObject *
11606unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011607{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011608 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011609 Py_ssize_t isize;
11610 Py_ssize_t osize, squote, dquote, i, o;
11611 Py_UCS4 max, quote;
11612 int ikind, okind;
11613 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011614
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011615 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011616 return NULL;
11617
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011618 isize = PyUnicode_GET_LENGTH(unicode);
11619 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011620
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011621 /* Compute length of output, quote characters, and
11622 maximum character */
11623 osize = 2; /* quotes */
11624 max = 127;
11625 squote = dquote = 0;
11626 ikind = PyUnicode_KIND(unicode);
11627 for (i = 0; i < isize; i++) {
11628 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11629 switch (ch) {
11630 case '\'': squote++; osize++; break;
11631 case '"': dquote++; osize++; break;
11632 case '\\': case '\t': case '\r': case '\n':
11633 osize += 2; break;
11634 default:
11635 /* Fast-path ASCII */
11636 if (ch < ' ' || ch == 0x7f)
11637 osize += 4; /* \xHH */
11638 else if (ch < 0x7f)
11639 osize++;
11640 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11641 osize++;
11642 max = ch > max ? ch : max;
11643 }
11644 else if (ch < 0x100)
11645 osize += 4; /* \xHH */
11646 else if (ch < 0x10000)
11647 osize += 6; /* \uHHHH */
11648 else
11649 osize += 10; /* \uHHHHHHHH */
11650 }
11651 }
11652
11653 quote = '\'';
11654 if (squote) {
11655 if (dquote)
11656 /* Both squote and dquote present. Use squote,
11657 and escape them */
11658 osize += squote;
11659 else
11660 quote = '"';
11661 }
11662
11663 repr = PyUnicode_New(osize, max);
11664 if (repr == NULL)
11665 return NULL;
11666 okind = PyUnicode_KIND(repr);
11667 odata = PyUnicode_DATA(repr);
11668
11669 PyUnicode_WRITE(okind, odata, 0, quote);
11670 PyUnicode_WRITE(okind, odata, osize-1, quote);
11671
11672 for (i = 0, o = 1; i < isize; i++) {
11673 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011674
11675 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011676 if ((ch == quote) || (ch == '\\')) {
11677 PyUnicode_WRITE(okind, odata, o++, '\\');
11678 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011679 continue;
11680 }
11681
Benjamin Peterson29060642009-01-31 22:14:21 +000011682 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011683 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011684 PyUnicode_WRITE(okind, odata, o++, '\\');
11685 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011686 }
11687 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011688 PyUnicode_WRITE(okind, odata, o++, '\\');
11689 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011690 }
11691 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011692 PyUnicode_WRITE(okind, odata, o++, '\\');
11693 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011694 }
11695
11696 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011697 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011698 PyUnicode_WRITE(okind, odata, o++, '\\');
11699 PyUnicode_WRITE(okind, odata, o++, 'x');
11700 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11701 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011702 }
11703
Georg Brandl559e5d72008-06-11 18:37:52 +000011704 /* Copy ASCII characters as-is */
11705 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011706 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011707 }
11708
Benjamin Peterson29060642009-01-31 22:14:21 +000011709 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011710 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011711 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011712 (categories Z* and C* except ASCII space)
11713 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011714 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011715 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011716 if (ch <= 0xff) {
11717 PyUnicode_WRITE(okind, odata, o++, '\\');
11718 PyUnicode_WRITE(okind, odata, o++, 'x');
11719 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11720 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011721 }
11722 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011723 else if (ch >= 0x10000) {
11724 PyUnicode_WRITE(okind, odata, o++, '\\');
11725 PyUnicode_WRITE(okind, odata, o++, 'U');
11726 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
11727 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
11728 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
11729 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
11730 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11731 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11732 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11733 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011734 }
11735 /* Map 16-bit characters to '\uxxxx' */
11736 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011737 PyUnicode_WRITE(okind, odata, o++, '\\');
11738 PyUnicode_WRITE(okind, odata, o++, 'u');
11739 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11740 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11741 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11742 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011743 }
11744 }
11745 /* Copy characters as-is */
11746 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011747 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011748 }
11749 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011750 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011751 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020011752 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000011753 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011754}
11755
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011756PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011757 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011758\n\
11759Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011760such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011761arguments start and end are interpreted as in slice notation.\n\
11762\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011763Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011764
11765static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011766unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011767{
Jesus Ceaac451502011-04-20 17:09:23 +020011768 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011769 Py_ssize_t start;
11770 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011771 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011772
Jesus Ceaac451502011-04-20 17:09:23 +020011773 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11774 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011775 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011776
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011777 if (PyUnicode_READY(self) == -1)
11778 return NULL;
11779 if (PyUnicode_READY(substring) == -1)
11780 return NULL;
11781
Victor Stinner794d5672011-10-10 03:21:36 +020011782 result = any_find_slice(-1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011783 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011784 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011785
11786 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011787
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011788 if (result == -2)
11789 return NULL;
11790
Christian Heimes217cfd12007-12-02 14:31:20 +000011791 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011792}
11793
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011794PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011795 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011796\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011797Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011798
11799static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011800unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011801{
Jesus Ceaac451502011-04-20 17:09:23 +020011802 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011803 Py_ssize_t start;
11804 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011805 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011806
Jesus Ceaac451502011-04-20 17:09:23 +020011807 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11808 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011809 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011810
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011811 if (PyUnicode_READY(self) == -1)
11812 return NULL;
11813 if (PyUnicode_READY(substring) == -1)
11814 return NULL;
11815
Victor Stinner794d5672011-10-10 03:21:36 +020011816 result = any_find_slice(-1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011817 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011818 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011819
11820 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011821
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011822 if (result == -2)
11823 return NULL;
11824
Guido van Rossumd57fd912000-03-10 22:53:23 +000011825 if (result < 0) {
11826 PyErr_SetString(PyExc_ValueError, "substring not found");
11827 return NULL;
11828 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011829
Christian Heimes217cfd12007-12-02 14:31:20 +000011830 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011831}
11832
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011833PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011834 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011835\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011836Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011837done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011838
11839static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011840unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011841{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011842 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011843 Py_UCS4 fillchar = ' ';
11844
Victor Stinnere9a29352011-10-01 02:14:59 +020011845 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011846 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011847
Victor Stinnere9a29352011-10-01 02:14:59 +020011848 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011849 return NULL;
11850
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011851 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011852 Py_INCREF(self);
11853 return (PyObject*) self;
11854 }
11855
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011856 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011857}
11858
Alexander Belopolsky40018472011-02-26 01:02:56 +000011859PyObject *
11860PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011861{
11862 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011863
Guido van Rossumd57fd912000-03-10 22:53:23 +000011864 s = PyUnicode_FromObject(s);
11865 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011866 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011867 if (sep != NULL) {
11868 sep = PyUnicode_FromObject(sep);
11869 if (sep == NULL) {
11870 Py_DECREF(s);
11871 return NULL;
11872 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011873 }
11874
Victor Stinner9310abb2011-10-05 00:59:23 +020011875 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011876
11877 Py_DECREF(s);
11878 Py_XDECREF(sep);
11879 return result;
11880}
11881
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011882PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011883 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011884\n\
11885Return a list of the words in S, using sep as the\n\
11886delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011887splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011888whitespace string is a separator and empty strings are\n\
11889removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011890
11891static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011892unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011893{
11894 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011895 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011896
Martin v. Löwis18e16552006-02-15 17:27:45 +000011897 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011898 return NULL;
11899
11900 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011901 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011902 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020011903 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011904 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011905 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011906}
11907
Thomas Wouters477c8d52006-05-27 19:21:47 +000011908PyObject *
11909PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11910{
11911 PyObject* str_obj;
11912 PyObject* sep_obj;
11913 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011914 int kind1, kind2, kind;
11915 void *buf1 = NULL, *buf2 = NULL;
11916 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011917
11918 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011919 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011920 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011921 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011922 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011923 Py_DECREF(str_obj);
11924 return NULL;
11925 }
11926
Victor Stinner14f8f022011-10-05 20:58:25 +020011927 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011928 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020011929 kind = Py_MAX(kind1, kind2);
11930 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011931 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020011932 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011933 if (!buf1)
11934 goto onError;
11935 buf2 = PyUnicode_DATA(sep_obj);
11936 if (kind2 != kind)
11937 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11938 if (!buf2)
11939 goto onError;
11940 len1 = PyUnicode_GET_LENGTH(str_obj);
11941 len2 = PyUnicode_GET_LENGTH(sep_obj);
11942
Victor Stinner14f8f022011-10-05 20:58:25 +020011943 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011944 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020011945 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
11946 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11947 else
11948 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011949 break;
11950 case PyUnicode_2BYTE_KIND:
11951 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11952 break;
11953 case PyUnicode_4BYTE_KIND:
11954 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11955 break;
11956 default:
11957 assert(0);
11958 out = 0;
11959 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011960
11961 Py_DECREF(sep_obj);
11962 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011963 if (kind1 != kind)
11964 PyMem_Free(buf1);
11965 if (kind2 != kind)
11966 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011967
11968 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011969 onError:
11970 Py_DECREF(sep_obj);
11971 Py_DECREF(str_obj);
11972 if (kind1 != kind && buf1)
11973 PyMem_Free(buf1);
11974 if (kind2 != kind && buf2)
11975 PyMem_Free(buf2);
11976 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011977}
11978
11979
11980PyObject *
11981PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11982{
11983 PyObject* str_obj;
11984 PyObject* sep_obj;
11985 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011986 int kind1, kind2, kind;
11987 void *buf1 = NULL, *buf2 = NULL;
11988 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011989
11990 str_obj = PyUnicode_FromObject(str_in);
11991 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011992 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011993 sep_obj = PyUnicode_FromObject(sep_in);
11994 if (!sep_obj) {
11995 Py_DECREF(str_obj);
11996 return NULL;
11997 }
11998
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011999 kind1 = PyUnicode_KIND(str_in);
12000 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012001 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012002 buf1 = PyUnicode_DATA(str_in);
12003 if (kind1 != kind)
12004 buf1 = _PyUnicode_AsKind(str_in, kind);
12005 if (!buf1)
12006 goto onError;
12007 buf2 = PyUnicode_DATA(sep_obj);
12008 if (kind2 != kind)
12009 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12010 if (!buf2)
12011 goto onError;
12012 len1 = PyUnicode_GET_LENGTH(str_obj);
12013 len2 = PyUnicode_GET_LENGTH(sep_obj);
12014
12015 switch(PyUnicode_KIND(str_in)) {
12016 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012017 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12018 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12019 else
12020 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012021 break;
12022 case PyUnicode_2BYTE_KIND:
12023 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12024 break;
12025 case PyUnicode_4BYTE_KIND:
12026 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12027 break;
12028 default:
12029 assert(0);
12030 out = 0;
12031 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012032
12033 Py_DECREF(sep_obj);
12034 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012035 if (kind1 != kind)
12036 PyMem_Free(buf1);
12037 if (kind2 != kind)
12038 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012039
12040 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012041 onError:
12042 Py_DECREF(sep_obj);
12043 Py_DECREF(str_obj);
12044 if (kind1 != kind && buf1)
12045 PyMem_Free(buf1);
12046 if (kind2 != kind && buf2)
12047 PyMem_Free(buf2);
12048 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012049}
12050
12051PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012052 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012053\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012054Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012055the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012056found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012057
12058static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012059unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012060{
Victor Stinner9310abb2011-10-05 00:59:23 +020012061 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012062}
12063
12064PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012065 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012066\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012067Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012068the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012069separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012070
12071static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012072unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012073{
Victor Stinner9310abb2011-10-05 00:59:23 +020012074 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012075}
12076
Alexander Belopolsky40018472011-02-26 01:02:56 +000012077PyObject *
12078PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012079{
12080 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012081
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012082 s = PyUnicode_FromObject(s);
12083 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012084 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012085 if (sep != NULL) {
12086 sep = PyUnicode_FromObject(sep);
12087 if (sep == NULL) {
12088 Py_DECREF(s);
12089 return NULL;
12090 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012091 }
12092
Victor Stinner9310abb2011-10-05 00:59:23 +020012093 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012094
12095 Py_DECREF(s);
12096 Py_XDECREF(sep);
12097 return result;
12098}
12099
12100PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012101 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012102\n\
12103Return a list of the words in S, using sep as the\n\
12104delimiter string, starting at the end of the string and\n\
12105working to the front. If maxsplit is given, at most maxsplit\n\
12106splits are done. If sep is not specified, any whitespace string\n\
12107is a separator.");
12108
12109static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012110unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012111{
12112 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012113 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012114
Martin v. Löwis18e16552006-02-15 17:27:45 +000012115 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012116 return NULL;
12117
12118 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012119 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012120 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012121 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012122 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012123 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012124}
12125
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012126PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012127 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012128\n\
12129Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012130Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012131is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012132
12133static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012134unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012135{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012136 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012137 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012138
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012139 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12140 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012141 return NULL;
12142
Guido van Rossum86662912000-04-11 15:38:46 +000012143 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012144}
12145
12146static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012147PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012148{
Walter Dörwald346737f2007-05-31 10:44:43 +000012149 if (PyUnicode_CheckExact(self)) {
12150 Py_INCREF(self);
12151 return self;
12152 } else
12153 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020012154 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012155}
12156
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012157PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012158 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012159\n\
12160Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012161and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012162
12163static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012164unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012165{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012166 return fixup(self, fixswapcase);
12167}
12168
Georg Brandlceee0772007-11-27 23:48:05 +000012169PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012170 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012171\n\
12172Return a translation table usable for str.translate().\n\
12173If there is only one argument, it must be a dictionary mapping Unicode\n\
12174ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012175Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012176If there are two arguments, they must be strings of equal length, and\n\
12177in the resulting dictionary, each character in x will be mapped to the\n\
12178character at the same position in y. If there is a third argument, it\n\
12179must be a string, whose characters will be mapped to None in the result.");
12180
12181static PyObject*
12182unicode_maketrans(PyUnicodeObject *null, PyObject *args)
12183{
12184 PyObject *x, *y = NULL, *z = NULL;
12185 PyObject *new = NULL, *key, *value;
12186 Py_ssize_t i = 0;
12187 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012188
Georg Brandlceee0772007-11-27 23:48:05 +000012189 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12190 return NULL;
12191 new = PyDict_New();
12192 if (!new)
12193 return NULL;
12194 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012195 int x_kind, y_kind, z_kind;
12196 void *x_data, *y_data, *z_data;
12197
Georg Brandlceee0772007-11-27 23:48:05 +000012198 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012199 if (!PyUnicode_Check(x)) {
12200 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12201 "be a string if there is a second argument");
12202 goto err;
12203 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012204 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012205 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12206 "arguments must have equal length");
12207 goto err;
12208 }
12209 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012210 x_kind = PyUnicode_KIND(x);
12211 y_kind = PyUnicode_KIND(y);
12212 x_data = PyUnicode_DATA(x);
12213 y_data = PyUnicode_DATA(y);
12214 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12215 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12216 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012217 if (!key || !value)
12218 goto err;
12219 res = PyDict_SetItem(new, key, value);
12220 Py_DECREF(key);
12221 Py_DECREF(value);
12222 if (res < 0)
12223 goto err;
12224 }
12225 /* create entries for deleting chars in z */
12226 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012227 z_kind = PyUnicode_KIND(z);
12228 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000012229 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012230 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012231 if (!key)
12232 goto err;
12233 res = PyDict_SetItem(new, key, Py_None);
12234 Py_DECREF(key);
12235 if (res < 0)
12236 goto err;
12237 }
12238 }
12239 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012240 int kind;
12241 void *data;
12242
Georg Brandlceee0772007-11-27 23:48:05 +000012243 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012244 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012245 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12246 "to maketrans it must be a dict");
12247 goto err;
12248 }
12249 /* copy entries into the new dict, converting string keys to int keys */
12250 while (PyDict_Next(x, &i, &key, &value)) {
12251 if (PyUnicode_Check(key)) {
12252 /* convert string keys to integer keys */
12253 PyObject *newkey;
12254 if (PyUnicode_GET_SIZE(key) != 1) {
12255 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12256 "table must be of length 1");
12257 goto err;
12258 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012259 kind = PyUnicode_KIND(key);
12260 data = PyUnicode_DATA(key);
12261 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012262 if (!newkey)
12263 goto err;
12264 res = PyDict_SetItem(new, newkey, value);
12265 Py_DECREF(newkey);
12266 if (res < 0)
12267 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012268 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012269 /* just keep integer keys */
12270 if (PyDict_SetItem(new, key, value) < 0)
12271 goto err;
12272 } else {
12273 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12274 "be strings or integers");
12275 goto err;
12276 }
12277 }
12278 }
12279 return new;
12280 err:
12281 Py_DECREF(new);
12282 return NULL;
12283}
12284
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012285PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012286 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012287\n\
12288Return a copy of the string S, where all characters have been mapped\n\
12289through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012290Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012291Unmapped characters are left untouched. Characters mapped to None\n\
12292are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012293
12294static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012295unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012296{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012297 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012298}
12299
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012300PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012301 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012302\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012303Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012304
12305static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012306unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012307{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012308 return fixup(self, fixupper);
12309}
12310
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012311PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012312 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012313\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012314Pad a numeric string S with zeros on the left, to fill a field\n\
12315of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012316
12317static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012318unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012319{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012320 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012321 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012322 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012323 int kind;
12324 void *data;
12325 Py_UCS4 chr;
12326
12327 if (PyUnicode_READY(self) == -1)
12328 return NULL;
12329
Martin v. Löwis18e16552006-02-15 17:27:45 +000012330 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012331 return NULL;
12332
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012333 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012334 if (PyUnicode_CheckExact(self)) {
12335 Py_INCREF(self);
12336 return (PyObject*) self;
12337 }
12338 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020012339 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012340 }
12341
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012342 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012343
12344 u = pad(self, fill, 0, '0');
12345
Walter Dörwald068325e2002-04-15 13:36:47 +000012346 if (u == NULL)
12347 return NULL;
12348
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012349 kind = PyUnicode_KIND(u);
12350 data = PyUnicode_DATA(u);
12351 chr = PyUnicode_READ(kind, data, fill);
12352
12353 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012354 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012355 PyUnicode_WRITE(kind, data, 0, chr);
12356 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012357 }
12358
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012359 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012360 return (PyObject*) u;
12361}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012362
#if 0
/* Disabled helper: forwards to
   PyUnicode_TransformDecimalAndSpaceToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12370
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012371PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012372 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012373\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012374Return True if S starts with the specified prefix, False otherwise.\n\
12375With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012376With optional end, stop comparing S at that position.\n\
12377prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012378
12379static PyObject *
12380unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012381 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012382{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012383 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012384 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012385 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012386 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012387 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012388
Jesus Ceaac451502011-04-20 17:09:23 +020012389 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012390 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012391 if (PyTuple_Check(subobj)) {
12392 Py_ssize_t i;
12393 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12394 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012395 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012396 if (substring == NULL)
12397 return NULL;
12398 result = tailmatch(self, substring, start, end, -1);
12399 Py_DECREF(substring);
12400 if (result) {
12401 Py_RETURN_TRUE;
12402 }
12403 }
12404 /* nothing matched */
12405 Py_RETURN_FALSE;
12406 }
12407 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012408 if (substring == NULL) {
12409 if (PyErr_ExceptionMatches(PyExc_TypeError))
12410 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12411 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012412 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012413 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012414 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012415 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012416 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012417}
12418
12419
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012420PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012421 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012422\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012423Return True if S ends with the specified suffix, False otherwise.\n\
12424With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012425With optional end, stop comparing S at that position.\n\
12426suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012427
12428static PyObject *
12429unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012430 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012431{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012432 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012433 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012434 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012435 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012436 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012437
Jesus Ceaac451502011-04-20 17:09:23 +020012438 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012439 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012440 if (PyTuple_Check(subobj)) {
12441 Py_ssize_t i;
12442 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12443 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012444 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012445 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012446 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012447 result = tailmatch(self, substring, start, end, +1);
12448 Py_DECREF(substring);
12449 if (result) {
12450 Py_RETURN_TRUE;
12451 }
12452 }
12453 Py_RETURN_FALSE;
12454 }
12455 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012456 if (substring == NULL) {
12457 if (PyErr_ExceptionMatches(PyExc_TypeError))
12458 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12459 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012460 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012461 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012462 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012463 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012464 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012465}
12466
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012467#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012468
12469PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012470 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012471\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012472Return a formatted version of S, using substitutions from args and kwargs.\n\
12473The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012474
Eric Smith27bbca62010-11-04 17:06:58 +000012475PyDoc_STRVAR(format_map__doc__,
12476 "S.format_map(mapping) -> str\n\
12477\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012478Return a formatted version of S, using substitutions from mapping.\n\
12479The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012480
Eric Smith4a7d76d2008-05-30 18:10:19 +000012481static PyObject *
12482unicode__format__(PyObject* self, PyObject* args)
12483{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012484 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012485
12486 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12487 return NULL;
12488
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012489 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012490 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012491 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012492}
12493
Eric Smith8c663262007-08-25 02:26:07 +000012494PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012495 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012496\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012497Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012498
12499static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012500unicode__sizeof__(PyUnicodeObject *v)
12501{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012502 Py_ssize_t size;
12503
12504 /* If it's a compact object, account for base structure +
12505 character data. */
12506 if (PyUnicode_IS_COMPACT_ASCII(v))
12507 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12508 else if (PyUnicode_IS_COMPACT(v))
12509 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012510 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012511 else {
12512 /* If it is a two-block object, account for base object, and
12513 for character block if present. */
12514 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012515 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012516 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012517 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012518 }
12519 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012520 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012521 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012522 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012523 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012524 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012525
12526 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012527}
12528
12529PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012530 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012531
12532static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012533unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012534{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012535 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012536 if (!copy)
12537 return NULL;
12538 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012539}
12540
Guido van Rossumd57fd912000-03-10 22:53:23 +000012541static PyMethodDef unicode_methods[] = {
12542
12543 /* Order is according to common usage: often used methods should
12544 appear first, since lookup is done sequentially. */
12545
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012546 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012547 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12548 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012549 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012550 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12551 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12552 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12553 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12554 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12555 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12556 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012557 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012558 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12559 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12560 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012561 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012562 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12563 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12564 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012565 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012566 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012567 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012568 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012569 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12570 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12571 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12572 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12573 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12574 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12575 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12576 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12577 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12578 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12579 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12580 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12581 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12582 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012583 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012584 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012585 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012586 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012587 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012588 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012589 {"maketrans", (PyCFunction) unicode_maketrans,
12590 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012591 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012592#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012593 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012594#endif
12595
12596#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012597 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012598 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012599#endif
12600
Benjamin Peterson14339b62009-01-31 16:36:08 +000012601 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012602 {NULL, NULL}
12603};
12604
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012605static PyObject *
12606unicode_mod(PyObject *v, PyObject *w)
12607{
Brian Curtindfc80e32011-08-10 20:28:54 -050012608 if (!PyUnicode_Check(v))
12609 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012610 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012611}
12612
12613static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012614 0, /*nb_add*/
12615 0, /*nb_subtract*/
12616 0, /*nb_multiply*/
12617 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012618};
12619
Guido van Rossumd57fd912000-03-10 22:53:23 +000012620static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012621 (lenfunc) unicode_length, /* sq_length */
12622 PyUnicode_Concat, /* sq_concat */
12623 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12624 (ssizeargfunc) unicode_getitem, /* sq_item */
12625 0, /* sq_slice */
12626 0, /* sq_ass_item */
12627 0, /* sq_ass_slice */
12628 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012629};
12630
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012631static PyObject*
12632unicode_subscript(PyUnicodeObject* self, PyObject* item)
12633{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012634 if (PyUnicode_READY(self) == -1)
12635 return NULL;
12636
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012637 if (PyIndex_Check(item)) {
12638 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012639 if (i == -1 && PyErr_Occurred())
12640 return NULL;
12641 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012642 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012643 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012644 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012645 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012646 PyObject *result;
12647 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012648 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012649 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012650
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012651 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012652 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012653 return NULL;
12654 }
12655
12656 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012657 return PyUnicode_New(0, 0);
12658 } else if (start == 0 && step == 1 &&
12659 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012660 PyUnicode_CheckExact(self)) {
12661 Py_INCREF(self);
12662 return (PyObject *)self;
12663 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012664 return PyUnicode_Substring((PyObject*)self,
12665 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012666 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012667 /* General case */
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012668 max_char = 0;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012669 src_kind = PyUnicode_KIND(self);
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012670 kind_limit = kind_maxchar_limit(src_kind);
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012671 src_data = PyUnicode_DATA(self);
12672 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12673 ch = PyUnicode_READ(src_kind, src_data, cur);
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012674 if (ch > max_char) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012675 max_char = ch;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012676 if (max_char >= kind_limit)
12677 break;
12678 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012679 }
12680 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012681 if (result == NULL)
12682 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012683 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012684 dest_data = PyUnicode_DATA(result);
12685
12686 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012687 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
12688 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012689 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012690 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012691 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012692 } else {
12693 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12694 return NULL;
12695 }
12696}
12697
12698static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012699 (lenfunc)unicode_length, /* mp_length */
12700 (binaryfunc)unicode_subscript, /* mp_subscript */
12701 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012702};
12703
Guido van Rossumd57fd912000-03-10 22:53:23 +000012704
/* Helpers for PyUnicode_Format() */
12706
12707static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012708getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012709{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012710 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012711 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012712 (*p_argidx)++;
12713 if (arglen < 0)
12714 return args;
12715 else
12716 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012717 }
12718 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012719 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012720 return NULL;
12721}
12722
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012723/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012724
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012725static PyObject *
12726formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012727{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012728 char *p;
12729 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012730 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012731
Guido van Rossumd57fd912000-03-10 22:53:23 +000012732 x = PyFloat_AsDouble(v);
12733 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012734 return NULL;
12735
Guido van Rossumd57fd912000-03-10 22:53:23 +000012736 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012737 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012738
Eric Smith0923d1d2009-04-16 20:16:10 +000012739 p = PyOS_double_to_string(x, type, prec,
12740 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012741 if (p == NULL)
12742 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012743 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012744 PyMem_Free(p);
12745 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012746}
12747
Tim Peters38fd5b62000-09-21 05:43:11 +000012748static PyObject*
12749formatlong(PyObject *val, int flags, int prec, int type)
12750{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012751 char *buf;
12752 int len;
12753 PyObject *str; /* temporary string object. */
12754 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012755
Benjamin Peterson14339b62009-01-31 16:36:08 +000012756 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12757 if (!str)
12758 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012759 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012760 Py_DECREF(str);
12761 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012762}
12763
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012764static Py_UCS4
12765formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012766{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012767 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012768 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012769 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012770 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012771 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012772 goto onError;
12773 }
12774 else {
12775 /* Integer input truncated to a character */
12776 long x;
12777 x = PyLong_AsLong(v);
12778 if (x == -1 && PyErr_Occurred())
12779 goto onError;
12780
12781 if (x < 0 || x > 0x10ffff) {
12782 PyErr_SetString(PyExc_OverflowError,
12783 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012784 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012785 }
12786
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012787 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012788 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012789
Benjamin Peterson29060642009-01-31 22:14:21 +000012790 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012791 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012792 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012793 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012794}
12795
Antoine Pitrou978b9d22011-10-07 12:35:48 +020012796static int
12797repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
12798{
12799 int r;
12800 assert(count > 0);
12801 assert(PyUnicode_Check(obj));
12802 if (count > 5) {
12803 PyObject *repeated = unicode_repeat((PyUnicodeObject *) obj, count);
12804 if (repeated == NULL)
12805 return -1;
12806 r = _PyAccu_Accumulate(acc, repeated);
12807 Py_DECREF(repeated);
12808 return r;
12809 }
12810 else {
12811 do {
12812 if (_PyAccu_Accumulate(acc, obj))
12813 return -1;
12814 } while (--count);
12815 return 0;
12816 }
12817}
12818
Alexander Belopolsky40018472011-02-26 01:02:56 +000012819PyObject *
12820PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012821{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012822 void *fmt;
12823 int fmtkind;
12824 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012825 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012826 int r;
12827 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012828 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012829 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012830 PyObject *temp = NULL;
12831 PyObject *second = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012832 PyUnicodeObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012833 _PyAccu acc;
12834 static PyObject *plus, *minus, *blank, *zero, *percent;
12835
12836 if (!plus && !(plus = get_latin1_char('+')))
12837 return NULL;
12838 if (!minus && !(minus = get_latin1_char('-')))
12839 return NULL;
12840 if (!blank && !(blank = get_latin1_char(' ')))
12841 return NULL;
12842 if (!zero && !(zero = get_latin1_char('0')))
12843 return NULL;
12844 if (!percent && !(percent = get_latin1_char('%')))
12845 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000012846
Guido van Rossumd57fd912000-03-10 22:53:23 +000012847 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012848 PyErr_BadInternalCall();
12849 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012850 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012851 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12852 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012853 return NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012854 if (_PyAccu_Init(&acc))
12855 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012856 fmt = PyUnicode_DATA(uformat);
12857 fmtkind = PyUnicode_KIND(uformat);
12858 fmtcnt = PyUnicode_GET_LENGTH(uformat);
12859 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012860
Guido van Rossumd57fd912000-03-10 22:53:23 +000012861 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012862 arglen = PyTuple_Size(args);
12863 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012864 }
12865 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012866 arglen = -1;
12867 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012868 }
Christian Heimes90aa7642007-12-19 02:45:37 +000012869 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000012870 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000012871 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012872
12873 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012874 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012875 PyObject *nonfmt;
12876 Py_ssize_t nonfmtpos;
12877 nonfmtpos = fmtpos++;
12878 while (fmtcnt >= 0 &&
12879 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
12880 fmtpos++;
12881 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012882 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012883 nonfmt = PyUnicode_Substring((PyObject *) uformat, nonfmtpos, fmtpos);
12884 if (nonfmt == NULL)
12885 goto onError;
12886 r = _PyAccu_Accumulate(&acc, nonfmt);
12887 Py_DECREF(nonfmt);
12888 if (r)
12889 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012890 }
12891 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012892 /* Got a format specifier */
12893 int flags = 0;
12894 Py_ssize_t width = -1;
12895 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012896 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012897 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000012898 int isnumok;
12899 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012900 void *pbuf = NULL;
12901 Py_ssize_t pindex, len;
12902 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012903
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012904 fmtpos++;
12905 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12906 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012907 Py_ssize_t keylen;
12908 PyObject *key;
12909 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012910
Benjamin Peterson29060642009-01-31 22:14:21 +000012911 if (dict == NULL) {
12912 PyErr_SetString(PyExc_TypeError,
12913 "format requires a mapping");
12914 goto onError;
12915 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012916 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012917 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012918 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012919 /* Skip over balanced parentheses */
12920 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012921 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012922 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012923 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012924 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012925 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012926 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012927 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012928 if (fmtcnt < 0 || pcount > 0) {
12929 PyErr_SetString(PyExc_ValueError,
12930 "incomplete format key");
12931 goto onError;
12932 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012933 key = PyUnicode_Substring((PyObject*)uformat,
12934 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012935 if (key == NULL)
12936 goto onError;
12937 if (args_owned) {
12938 Py_DECREF(args);
12939 args_owned = 0;
12940 }
12941 args = PyObject_GetItem(dict, key);
12942 Py_DECREF(key);
12943 if (args == NULL) {
12944 goto onError;
12945 }
12946 args_owned = 1;
12947 arglen = -1;
12948 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012949 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012950 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012951 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012952 case '-': flags |= F_LJUST; continue;
12953 case '+': flags |= F_SIGN; continue;
12954 case ' ': flags |= F_BLANK; continue;
12955 case '#': flags |= F_ALT; continue;
12956 case '0': flags |= F_ZERO; continue;
12957 }
12958 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012959 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012960 if (c == '*') {
12961 v = getnextarg(args, arglen, &argidx);
12962 if (v == NULL)
12963 goto onError;
12964 if (!PyLong_Check(v)) {
12965 PyErr_SetString(PyExc_TypeError,
12966 "* wants int");
12967 goto onError;
12968 }
12969 width = PyLong_AsLong(v);
12970 if (width == -1 && PyErr_Occurred())
12971 goto onError;
12972 if (width < 0) {
12973 flags |= F_LJUST;
12974 width = -width;
12975 }
12976 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012977 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012978 }
12979 else if (c >= '0' && c <= '9') {
12980 width = c - '0';
12981 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012982 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012983 if (c < '0' || c > '9')
12984 break;
12985 if ((width*10) / 10 != width) {
12986 PyErr_SetString(PyExc_ValueError,
12987 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012988 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012989 }
12990 width = width*10 + (c - '0');
12991 }
12992 }
12993 if (c == '.') {
12994 prec = 0;
12995 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012996 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012997 if (c == '*') {
12998 v = getnextarg(args, arglen, &argidx);
12999 if (v == NULL)
13000 goto onError;
13001 if (!PyLong_Check(v)) {
13002 PyErr_SetString(PyExc_TypeError,
13003 "* wants int");
13004 goto onError;
13005 }
13006 prec = PyLong_AsLong(v);
13007 if (prec == -1 && PyErr_Occurred())
13008 goto onError;
13009 if (prec < 0)
13010 prec = 0;
13011 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013012 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013013 }
13014 else if (c >= '0' && c <= '9') {
13015 prec = c - '0';
13016 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013017 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013018 if (c < '0' || c > '9')
13019 break;
13020 if ((prec*10) / 10 != prec) {
13021 PyErr_SetString(PyExc_ValueError,
13022 "prec too big");
13023 goto onError;
13024 }
13025 prec = prec*10 + (c - '0');
13026 }
13027 }
13028 } /* prec */
13029 if (fmtcnt >= 0) {
13030 if (c == 'h' || c == 'l' || c == 'L') {
13031 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013032 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013033 }
13034 }
13035 if (fmtcnt < 0) {
13036 PyErr_SetString(PyExc_ValueError,
13037 "incomplete format");
13038 goto onError;
13039 }
13040 if (c != '%') {
13041 v = getnextarg(args, arglen, &argidx);
13042 if (v == NULL)
13043 goto onError;
13044 }
13045 sign = 0;
13046 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013047 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013048 switch (c) {
13049
13050 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013051 _PyAccu_Accumulate(&acc, percent);
13052 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013053
13054 case 's':
13055 case 'r':
13056 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013057 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013058 temp = v;
13059 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013060 }
13061 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013062 if (c == 's')
13063 temp = PyObject_Str(v);
13064 else if (c == 'r')
13065 temp = PyObject_Repr(v);
13066 else
13067 temp = PyObject_ASCII(v);
13068 if (temp == NULL)
13069 goto onError;
13070 if (PyUnicode_Check(temp))
13071 /* nothing to do */;
13072 else {
13073 Py_DECREF(temp);
13074 PyErr_SetString(PyExc_TypeError,
13075 "%s argument has non-string str()");
13076 goto onError;
13077 }
13078 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013079 if (PyUnicode_READY(temp) == -1) {
13080 Py_CLEAR(temp);
13081 goto onError;
13082 }
13083 pbuf = PyUnicode_DATA(temp);
13084 kind = PyUnicode_KIND(temp);
13085 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013086 if (prec >= 0 && len > prec)
13087 len = prec;
13088 break;
13089
13090 case 'i':
13091 case 'd':
13092 case 'u':
13093 case 'o':
13094 case 'x':
13095 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013096 isnumok = 0;
13097 if (PyNumber_Check(v)) {
13098 PyObject *iobj=NULL;
13099
13100 if (PyLong_Check(v)) {
13101 iobj = v;
13102 Py_INCREF(iobj);
13103 }
13104 else {
13105 iobj = PyNumber_Long(v);
13106 }
13107 if (iobj!=NULL) {
13108 if (PyLong_Check(iobj)) {
13109 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013110 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013111 Py_DECREF(iobj);
13112 if (!temp)
13113 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013114 if (PyUnicode_READY(temp) == -1) {
13115 Py_CLEAR(temp);
13116 goto onError;
13117 }
13118 pbuf = PyUnicode_DATA(temp);
13119 kind = PyUnicode_KIND(temp);
13120 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013121 sign = 1;
13122 }
13123 else {
13124 Py_DECREF(iobj);
13125 }
13126 }
13127 }
13128 if (!isnumok) {
13129 PyErr_Format(PyExc_TypeError,
13130 "%%%c format: a number is required, "
13131 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13132 goto onError;
13133 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013134 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013135 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013136 fillobj = zero;
13137 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013138 break;
13139
13140 case 'e':
13141 case 'E':
13142 case 'f':
13143 case 'F':
13144 case 'g':
13145 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013146 temp = formatfloat(v, flags, prec, c);
13147 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013148 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013149 if (PyUnicode_READY(temp) == -1) {
13150 Py_CLEAR(temp);
13151 goto onError;
13152 }
13153 pbuf = PyUnicode_DATA(temp);
13154 kind = PyUnicode_KIND(temp);
13155 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013156 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013157 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013158 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013159 fillobj = zero;
13160 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013161 break;
13162
13163 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013164 {
13165 Py_UCS4 ch = formatchar(v);
13166 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013167 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013168 temp = _PyUnicode_FromUCS4(&ch, 1);
13169 if (temp == NULL)
13170 goto onError;
13171 pbuf = PyUnicode_DATA(temp);
13172 kind = PyUnicode_KIND(temp);
13173 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013174 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013175 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013176
13177 default:
13178 PyErr_Format(PyExc_ValueError,
13179 "unsupported format character '%c' (0x%x) "
13180 "at index %zd",
13181 (31<=c && c<=126) ? (char)c : '?',
13182 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013183 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013184 goto onError;
13185 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013186 /* pbuf is initialized here. */
13187 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013188 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013189 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13190 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013191 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013192 pindex++;
13193 }
13194 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13195 signobj = plus;
13196 len--;
13197 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013198 }
13199 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013200 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013201 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013202 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013203 else
13204 sign = 0;
13205 }
13206 if (width < len)
13207 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013208 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013209 if (fill != ' ') {
13210 assert(signobj != NULL);
13211 if (_PyAccu_Accumulate(&acc, signobj))
13212 goto onError;
13213 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013214 if (width > len)
13215 width--;
13216 }
13217 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013218 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013219 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013220 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013221 second = get_latin1_char(
13222 PyUnicode_READ(kind, pbuf, pindex + 1));
13223 pindex += 2;
13224 if (second == NULL ||
13225 _PyAccu_Accumulate(&acc, zero) ||
13226 _PyAccu_Accumulate(&acc, second))
13227 goto onError;
13228 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013229 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013230 width -= 2;
13231 if (width < 0)
13232 width = 0;
13233 len -= 2;
13234 }
13235 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013236 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013237 if (repeat_accumulate(&acc, fillobj, width - len))
13238 goto onError;
13239 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013240 }
13241 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013242 if (sign) {
13243 assert(signobj != NULL);
13244 if (_PyAccu_Accumulate(&acc, signobj))
13245 goto onError;
13246 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013247 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013248 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13249 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013250 second = get_latin1_char(
13251 PyUnicode_READ(kind, pbuf, pindex + 1));
13252 pindex += 2;
13253 if (second == NULL ||
13254 _PyAccu_Accumulate(&acc, zero) ||
13255 _PyAccu_Accumulate(&acc, second))
13256 goto onError;
13257 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013258 }
13259 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013260 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013261 if (temp != NULL) {
13262 assert(pbuf == PyUnicode_DATA(temp));
13263 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013264 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013265 else {
13266 const char *p = (const char *) pbuf;
13267 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013268 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013269 v = PyUnicode_FromKindAndData(kind, p, len);
13270 }
13271 if (v == NULL)
13272 goto onError;
13273 r = _PyAccu_Accumulate(&acc, v);
13274 Py_DECREF(v);
13275 if (r)
13276 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013277 if (width > len && repeat_accumulate(&acc, blank, width - len))
13278 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013279 if (dict && (argidx < arglen) && c != '%') {
13280 PyErr_SetString(PyExc_TypeError,
13281 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013282 goto onError;
13283 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013284 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013285 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013286 } /* until end */
13287 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013288 PyErr_SetString(PyExc_TypeError,
13289 "not all arguments converted during string formatting");
13290 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013291 }
13292
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013293 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013294 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013295 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013296 }
13297 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013298 Py_XDECREF(temp);
13299 Py_XDECREF(second);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013300 return (PyObject *)result;
13301
Benjamin Peterson29060642009-01-31 22:14:21 +000013302 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013303 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013304 Py_XDECREF(temp);
13305 Py_XDECREF(second);
13306 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013307 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013308 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013309 }
13310 return NULL;
13311}
13312
Jeremy Hylton938ace62002-07-17 16:30:39 +000013313static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013314unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13315
Tim Peters6d6c1a32001-08-02 04:15:00 +000013316static PyObject *
13317unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13318{
Benjamin Peterson29060642009-01-31 22:14:21 +000013319 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013320 static char *kwlist[] = {"object", "encoding", "errors", 0};
13321 char *encoding = NULL;
13322 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013323
Benjamin Peterson14339b62009-01-31 16:36:08 +000013324 if (type != &PyUnicode_Type)
13325 return unicode_subtype_new(type, args, kwds);
13326 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013327 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013328 return NULL;
13329 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013330 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013331 if (encoding == NULL && errors == NULL)
13332 return PyObject_Str(x);
13333 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013334 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013335}
13336
Guido van Rossume023fe02001-08-30 03:12:59 +000013337static PyObject *
13338unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13339{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013340 PyUnicodeObject *unicode, *self;
13341 Py_ssize_t length, char_size;
13342 int share_wstr, share_utf8;
13343 unsigned int kind;
13344 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013345
Benjamin Peterson14339b62009-01-31 16:36:08 +000013346 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013347
13348 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
13349 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013350 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013351 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013352 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013353 return NULL;
13354
13355 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
13356 if (self == NULL) {
13357 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013358 return NULL;
13359 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013360 kind = PyUnicode_KIND(unicode);
13361 length = PyUnicode_GET_LENGTH(unicode);
13362
13363 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013364#ifdef Py_DEBUG
13365 _PyUnicode_HASH(self) = -1;
13366#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013367 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013368#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013369 _PyUnicode_STATE(self).interned = 0;
13370 _PyUnicode_STATE(self).kind = kind;
13371 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013372 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013373 _PyUnicode_STATE(self).ready = 1;
13374 _PyUnicode_WSTR(self) = NULL;
13375 _PyUnicode_UTF8_LENGTH(self) = 0;
13376 _PyUnicode_UTF8(self) = NULL;
13377 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013378 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013379
13380 share_utf8 = 0;
13381 share_wstr = 0;
13382 if (kind == PyUnicode_1BYTE_KIND) {
13383 char_size = 1;
13384 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13385 share_utf8 = 1;
13386 }
13387 else if (kind == PyUnicode_2BYTE_KIND) {
13388 char_size = 2;
13389 if (sizeof(wchar_t) == 2)
13390 share_wstr = 1;
13391 }
13392 else {
13393 assert(kind == PyUnicode_4BYTE_KIND);
13394 char_size = 4;
13395 if (sizeof(wchar_t) == 4)
13396 share_wstr = 1;
13397 }
13398
13399 /* Ensure we won't overflow the length. */
13400 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13401 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013402 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013403 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013404 data = PyObject_MALLOC((length + 1) * char_size);
13405 if (data == NULL) {
13406 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013407 goto onError;
13408 }
13409
Victor Stinnerc3c74152011-10-02 20:39:55 +020013410 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013411 if (share_utf8) {
13412 _PyUnicode_UTF8_LENGTH(self) = length;
13413 _PyUnicode_UTF8(self) = data;
13414 }
13415 if (share_wstr) {
13416 _PyUnicode_WSTR_LENGTH(self) = length;
13417 _PyUnicode_WSTR(self) = (wchar_t *)data;
13418 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013419
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013420 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013421 kind * (length + 1));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013422 Py_DECREF(unicode);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013423 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013424#ifdef Py_DEBUG
13425 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13426#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013427 return (PyObject *)self;
13428
13429onError:
13430 Py_DECREF(unicode);
13431 Py_DECREF(self);
13432 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013433}
13434
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013435PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013436 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013437\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013438Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013439encoding defaults to the current default string encoding.\n\
13440errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013441
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013442static PyObject *unicode_iter(PyObject *seq);
13443
Guido van Rossumd57fd912000-03-10 22:53:23 +000013444PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013445 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013446 "str", /* tp_name */
13447 sizeof(PyUnicodeObject), /* tp_size */
13448 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013449 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013450 (destructor)unicode_dealloc, /* tp_dealloc */
13451 0, /* tp_print */
13452 0, /* tp_getattr */
13453 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013454 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013455 unicode_repr, /* tp_repr */
13456 &unicode_as_number, /* tp_as_number */
13457 &unicode_as_sequence, /* tp_as_sequence */
13458 &unicode_as_mapping, /* tp_as_mapping */
13459 (hashfunc) unicode_hash, /* tp_hash*/
13460 0, /* tp_call*/
13461 (reprfunc) unicode_str, /* tp_str */
13462 PyObject_GenericGetAttr, /* tp_getattro */
13463 0, /* tp_setattro */
13464 0, /* tp_as_buffer */
13465 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013466 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013467 unicode_doc, /* tp_doc */
13468 0, /* tp_traverse */
13469 0, /* tp_clear */
13470 PyUnicode_RichCompare, /* tp_richcompare */
13471 0, /* tp_weaklistoffset */
13472 unicode_iter, /* tp_iter */
13473 0, /* tp_iternext */
13474 unicode_methods, /* tp_methods */
13475 0, /* tp_members */
13476 0, /* tp_getset */
13477 &PyBaseObject_Type, /* tp_base */
13478 0, /* tp_dict */
13479 0, /* tp_descr_get */
13480 0, /* tp_descr_set */
13481 0, /* tp_dictoffset */
13482 0, /* tp_init */
13483 0, /* tp_alloc */
13484 unicode_new, /* tp_new */
13485 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013486};
13487
13488/* Initialize the Unicode implementation */
13489
Thomas Wouters78890102000-07-22 19:25:51 +000013490void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013491{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013492 int i;
13493
Thomas Wouters477c8d52006-05-27 19:21:47 +000013494 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013495 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013496 0x000A, /* LINE FEED */
13497 0x000D, /* CARRIAGE RETURN */
13498 0x001C, /* FILE SEPARATOR */
13499 0x001D, /* GROUP SEPARATOR */
13500 0x001E, /* RECORD SEPARATOR */
13501 0x0085, /* NEXT LINE */
13502 0x2028, /* LINE SEPARATOR */
13503 0x2029, /* PARAGRAPH SEPARATOR */
13504 };
13505
Fred Drakee4315f52000-05-09 19:53:39 +000013506 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013507 unicode_empty = PyUnicode_New(0, 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013508 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013509 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013510 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013511
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013512 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013513 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013514 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013515 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013516
13517 /* initialize the linebreak bloom filter */
13518 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013519 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013520 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013521
13522 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013523}
13524
13525/* Finalize the Unicode implementation */
13526
/* Releases nothing and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
13532
Guido van Rossumd57fd912000-03-10 22:53:23 +000013533void
Thomas Wouters78890102000-07-22 19:25:51 +000013534_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013535{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013536 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013537
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013538 Py_XDECREF(unicode_empty);
13539 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013540
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013541 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013542 if (unicode_latin1[i]) {
13543 Py_DECREF(unicode_latin1[i]);
13544 unicode_latin1[i] = NULL;
13545 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013546 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020013547 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000013548 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013549}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013550
Walter Dörwald16807132007-05-25 13:52:07 +000013551void
13552PyUnicode_InternInPlace(PyObject **p)
13553{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013554 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
13555 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013556#ifdef Py_DEBUG
13557 assert(s != NULL);
13558 assert(_PyUnicode_CHECK(s));
13559#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013560 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013561 return;
13562#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013563 /* If it's a subclass, we don't really know what putting
13564 it in the interned dict might do. */
13565 if (!PyUnicode_CheckExact(s))
13566 return;
13567 if (PyUnicode_CHECK_INTERNED(s))
13568 return;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013569 if (_PyUnicode_READY_REPLACE(p)) {
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013570 assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013571 return;
13572 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013573 s = (PyUnicodeObject *)(*p);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013574 if (interned == NULL) {
13575 interned = PyDict_New();
13576 if (interned == NULL) {
13577 PyErr_Clear(); /* Don't leave an exception */
13578 return;
13579 }
13580 }
13581 /* It might be that the GetItem call fails even
13582 though the key is present in the dictionary,
13583 namely when this happens during a stack overflow. */
13584 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000013585 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013586 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013587
Benjamin Peterson29060642009-01-31 22:14:21 +000013588 if (t) {
13589 Py_INCREF(t);
13590 Py_DECREF(*p);
13591 *p = t;
13592 return;
13593 }
Walter Dörwald16807132007-05-25 13:52:07 +000013594
Benjamin Peterson14339b62009-01-31 16:36:08 +000013595 PyThreadState_GET()->recursion_critical = 1;
13596 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
13597 PyErr_Clear();
13598 PyThreadState_GET()->recursion_critical = 0;
13599 return;
13600 }
13601 PyThreadState_GET()->recursion_critical = 0;
13602 /* The two references in interned are not counted by refcnt.
13603 The deallocator will take care of this */
13604 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013605 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013606}
13607
13608void
13609PyUnicode_InternImmortal(PyObject **p)
13610{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013611 PyUnicodeObject *u = (PyUnicodeObject *)*p;
13612
Benjamin Peterson14339b62009-01-31 16:36:08 +000013613 PyUnicode_InternInPlace(p);
13614 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013615 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013616 Py_INCREF(*p);
13617 }
Walter Dörwald16807132007-05-25 13:52:07 +000013618}
13619
13620PyObject *
13621PyUnicode_InternFromString(const char *cp)
13622{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013623 PyObject *s = PyUnicode_FromString(cp);
13624 if (s == NULL)
13625 return NULL;
13626 PyUnicode_InternInPlace(&s);
13627 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013628}
13629
Alexander Belopolsky40018472011-02-26 01:02:56 +000013630void
13631_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013632{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013633 PyObject *keys;
13634 PyUnicodeObject *s;
13635 Py_ssize_t i, n;
13636 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013637
Benjamin Peterson14339b62009-01-31 16:36:08 +000013638 if (interned == NULL || !PyDict_Check(interned))
13639 return;
13640 keys = PyDict_Keys(interned);
13641 if (keys == NULL || !PyList_Check(keys)) {
13642 PyErr_Clear();
13643 return;
13644 }
Walter Dörwald16807132007-05-25 13:52:07 +000013645
Benjamin Peterson14339b62009-01-31 16:36:08 +000013646 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13647 detector, interned unicode strings are not forcibly deallocated;
13648 rather, we give them their stolen references back, and then clear
13649 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013650
Benjamin Peterson14339b62009-01-31 16:36:08 +000013651 n = PyList_GET_SIZE(keys);
13652 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013653 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013654 for (i = 0; i < n; i++) {
13655 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013656 if (PyUnicode_READY(s) == -1) {
13657 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013658 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013659 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013660 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013661 case SSTATE_NOT_INTERNED:
13662 /* XXX Shouldn't happen */
13663 break;
13664 case SSTATE_INTERNED_IMMORTAL:
13665 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013666 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013667 break;
13668 case SSTATE_INTERNED_MORTAL:
13669 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013670 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013671 break;
13672 default:
13673 Py_FatalError("Inconsistent interned string state.");
13674 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013675 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013676 }
13677 fprintf(stderr, "total size of all interned strings: "
13678 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13679 "mortal/immortal\n", mortal_size, immortal_size);
13680 Py_DECREF(keys);
13681 PyDict_Clear(interned);
13682 Py_DECREF(interned);
13683 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013684}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013685
13686
13687/********************* Unicode Iterator **************************/
13688
13689typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013690 PyObject_HEAD
13691 Py_ssize_t it_index;
13692 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013693} unicodeiterobject;
13694
13695static void
13696unicodeiter_dealloc(unicodeiterobject *it)
13697{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013698 _PyObject_GC_UNTRACK(it);
13699 Py_XDECREF(it->it_seq);
13700 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013701}
13702
13703static int
13704unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13705{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013706 Py_VISIT(it->it_seq);
13707 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013708}
13709
13710static PyObject *
13711unicodeiter_next(unicodeiterobject *it)
13712{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013713 PyUnicodeObject *seq;
13714 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013715
Benjamin Peterson14339b62009-01-31 16:36:08 +000013716 assert(it != NULL);
13717 seq = it->it_seq;
13718 if (seq == NULL)
13719 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013720 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013721
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013722 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13723 int kind = PyUnicode_KIND(seq);
13724 void *data = PyUnicode_DATA(seq);
13725 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13726 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013727 if (item != NULL)
13728 ++it->it_index;
13729 return item;
13730 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013731
Benjamin Peterson14339b62009-01-31 16:36:08 +000013732 Py_DECREF(seq);
13733 it->it_seq = NULL;
13734 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013735}
13736
13737static PyObject *
13738unicodeiter_len(unicodeiterobject *it)
13739{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013740 Py_ssize_t len = 0;
13741 if (it->it_seq)
13742 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
13743 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013744}
13745
13746PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
13747
13748static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013749 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000013750 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000013751 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013752};
13753
13754PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013755 PyVarObject_HEAD_INIT(&PyType_Type, 0)
13756 "str_iterator", /* tp_name */
13757 sizeof(unicodeiterobject), /* tp_basicsize */
13758 0, /* tp_itemsize */
13759 /* methods */
13760 (destructor)unicodeiter_dealloc, /* tp_dealloc */
13761 0, /* tp_print */
13762 0, /* tp_getattr */
13763 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013764 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013765 0, /* tp_repr */
13766 0, /* tp_as_number */
13767 0, /* tp_as_sequence */
13768 0, /* tp_as_mapping */
13769 0, /* tp_hash */
13770 0, /* tp_call */
13771 0, /* tp_str */
13772 PyObject_GenericGetAttr, /* tp_getattro */
13773 0, /* tp_setattro */
13774 0, /* tp_as_buffer */
13775 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
13776 0, /* tp_doc */
13777 (traverseproc)unicodeiter_traverse, /* tp_traverse */
13778 0, /* tp_clear */
13779 0, /* tp_richcompare */
13780 0, /* tp_weaklistoffset */
13781 PyObject_SelfIter, /* tp_iter */
13782 (iternextfunc)unicodeiter_next, /* tp_iternext */
13783 unicodeiter_methods, /* tp_methods */
13784 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013785};
13786
13787static PyObject *
13788unicode_iter(PyObject *seq)
13789{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013790 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013791
Benjamin Peterson14339b62009-01-31 16:36:08 +000013792 if (!PyUnicode_Check(seq)) {
13793 PyErr_BadInternalCall();
13794 return NULL;
13795 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013796 if (PyUnicode_READY(seq) == -1)
13797 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013798 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13799 if (it == NULL)
13800 return NULL;
13801 it->it_index = 0;
13802 Py_INCREF(seq);
13803 it->it_seq = (PyUnicodeObject *)seq;
13804 _PyObject_GC_TRACK(it);
13805 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013806}
13807
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013808#define UNIOP(x) Py_UNICODE_##x
13809#define UNIOP_t Py_UNICODE
13810#include "uniops.h"
13811#undef UNIOP
13812#undef UNIOP_t
13813#define UNIOP(x) Py_UCS4_##x
13814#define UNIOP_t Py_UCS4
13815#include "uniops.h"
13816#undef UNIOP
13817#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000013818
Victor Stinner71133ff2010-09-01 23:43:53 +000013819Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000013820PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000013821{
13822 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
13823 Py_UNICODE *copy;
13824 Py_ssize_t size;
13825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013826 if (!PyUnicode_Check(unicode)) {
13827 PyErr_BadArgument();
13828 return NULL;
13829 }
Victor Stinner71133ff2010-09-01 23:43:53 +000013830 /* Ensure we won't overflow the size. */
13831 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13832 PyErr_NoMemory();
13833 return NULL;
13834 }
13835 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13836 size *= sizeof(Py_UNICODE);
13837 copy = PyMem_Malloc(size);
13838 if (copy == NULL) {
13839 PyErr_NoMemory();
13840 return NULL;
13841 }
13842 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
13843 return copy;
13844}
Martin v. Löwis5b222132007-06-10 09:51:05 +000013845
Georg Brandl66c221e2010-10-14 07:04:07 +000013846/* A _string module, to export formatter_parser and formatter_field_name_split
13847 to the string.Formatter class implemented in Python. */
13848
13849static PyMethodDef _string_methods[] = {
13850 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
13851 METH_O, PyDoc_STR("split the argument as a field name")},
13852 {"formatter_parser", (PyCFunction) formatter_parser,
13853 METH_O, PyDoc_STR("parse the argument as a format string")},
13854 {NULL, NULL}
13855};
13856
13857static struct PyModuleDef _string_module = {
13858 PyModuleDef_HEAD_INIT,
13859 "_string",
13860 PyDoc_STR("string helper module"),
13861 0,
13862 _string_methods,
13863 NULL,
13864 NULL,
13865 NULL,
13866 NULL
13867};
13868
13869PyMODINIT_FUNC
13870PyInit__string(void)
13871{
13872 return PyModule_Create(&_string_module);
13873}
13874
13875
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013876#ifdef __cplusplus
13877}
13878#endif