blob: af923685155f5387dd5c3ed481c9826bbfe437d2 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Victor Stinnerce5faf62011-10-05 00:42:43 +020049#ifdef Py_DEBUG
50# define DONT_MAKE_RESULT_READY
51#endif
52
/* Upper bound on the number of entries in the Unicode object free list. */

#define PyUnicode_MAXFREELIST 1024

/* Size threshold for the free list "stay alive" optimization.

   Objects on the free list whose size is below this limit keep their
   allocated Unicode memory, which reduces malloc() overhead for small
   Unicode objects.

   Worst case, this leaves
       PyUnicode_MAXFREELIST *
           (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + malloc()-overhead)
   bytes of unused garbage around.

   A limit of 0 effectively disables the feature.

   Note: this is an experimental feature! If you get core dumps when
   using Unicode objects, turn it off.
*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order switch; little endian unless WORDS_BIGENDIAN is defined. */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
/* Type check used inside the assertions of the macros below: the
   consistency checker in debug builds, a plain PyUnicode_Check()
   otherwise. */
#if defined(Py_DEBUG)
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Unchecked access to the utf8 field of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* Checked UTF-8 pointer: for a compact ASCII string this is the memory
   directly following the PyASCIIObject header, otherwise the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* Unchecked access to the utf8_length field. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* Checked UTF-8 length: the length field for a compact ASCII string,
   otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* Unchecked field accessors; all of them are usable as lvalues. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)

/* Accessors that first assert the object passes _PyUnicode_CHECK(). */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* Unchecked access to the data.any pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200136
/* Local override of PyUnicode_READY: asserts the object passes
   _PyUnicode_CHECK(), then evaluates to 0 if it is already ready and to
   the result of _PyUnicode_Ready() otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready((PyObject *)(op))))

/* Same as PyUnicode_READY, but takes a pointer to the object pointer and
   calls _PyUnicode_ReadyReplace() on it when the object is not ready.
   The argument is parenthesized before being dereferenced so that
   expressions such as `p + 1` expand correctly. */
#define _PyUnicode_READY_REPLACE(p_obj)                 \
    (assert(_PyUnicode_CHECK(*(p_obj))),                \
     (PyUnicode_IS_READY(*(p_obj)) ?                    \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
148
/* True if the utf8 pointer of the object is the same as its data pointer.
   Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of the object is the same as its data pointer.
   Uses the macro argument throughout; it must not depend on the caller
   having a variable named `unicode` in scope. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
156
/* Non-zero when the object has its own UTF-8 buffer: it is not a compact
   ASCII string, the utf8 pointer is set, and that pointer differs from
   the data pointer (i.e. the buffer is not shared with the data). */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                          \
    (assert(_PyUnicode_CHECK(op)),                              \
     (!PyUnicode_IS_COMPACT_ASCII(op)                           \
      && _PyUnicode_UTF8(op) != NULL                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* Non-zero when the object has its own wstr buffer: the wstr pointer is
   set and it is not the case that the object is ready with wstr pointing
   at the data (i.e. the buffer is not shared with the data). */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                          \
    (assert(_PyUnicode_CHECK(op)),                              \
     (_PyUnicode_WSTR(op) != NULL &&                            \
      !(PyUnicode_IS_READY(op) &&                               \
        _PyUnicode_WSTR(op) == PyUnicode_DATA(op))))
172
/* Generic element-wise conversion between character types.

   from_type and to_type must be valid type names.  begin and end delimit
   the source characters and should be of type "from_type *"; the range
   is [begin, end).  to points to the destination buffer and is treated
   as a "to_type *".  Each source character is cast to to_type and stored
   in the corresponding destination slot. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        const from_type *src_ = (begin);                \
        to_type *dst_ = (to_type *)(to);                \
        while (src_ < (end)) {                          \
            *dst_ = (to_type)*src_;                     \
            ++src_;                                     \
            ++dst_;                                     \
        }                                               \
    } while (0)

/* The Unicode string has been modified: invalidate the cached hash by
   resetting it to -1. */
#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
190
Walter Dörwald16807132007-05-25 13:52:07 +0000191/* This dictionary holds all interned unicode strings. Note that references
192 to strings in this dictionary are *not* counted in the string's ob_refcnt.
193 When the interned string reaches a refcnt of 0 the string deallocation
194 function will delete the reference from this dictionary.
195
196 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000197 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000198*/
199static PyObject *interned;
200
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000201/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200202static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000203
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200204/* List of static strings. */
205static _Py_Identifier *static_strings;
206
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000207/* Single character Unicode strings in the Latin-1 range are being
208 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200209static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000210
/* Fast detection of the most frequent whitespace characters: entry c is 1
   when the ASCII code point c is one of them and 0 otherwise.  The table
   has one entry per code point in 0x00..0x7F, 16 entries per row. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
241
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200242/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200243static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200244static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200245static void copy_characters(
246 PyObject *to, Py_ssize_t to_start,
247 PyObject *from, Py_ssize_t from_start,
248 Py_ssize_t how_many);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200249#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200250static int unicode_is_singleton(PyObject *unicode);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200251#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200252
Alexander Belopolsky40018472011-02-26 01:02:56 +0000253static PyObject *
254unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000255 PyObject **errorHandler,const char *encoding, const char *reason,
256 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
257 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
258
Alexander Belopolsky40018472011-02-26 01:02:56 +0000259static void
260raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300261 const char *encoding,
262 const Py_UNICODE *unicode, Py_ssize_t size,
263 Py_ssize_t startpos, Py_ssize_t endpos,
264 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000265
/* Same kind of table for line breaks: entry c is 1 when the ASCII code
   point c is flagged as a line break and 0 otherwise.  One entry per
   code point in 0x00..0x7F, 16 entries per row. */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
293
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300294/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
295 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000296Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000297PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000298{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000299#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000300 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000301#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000302 /* This is actually an illegal character, so it should
303 not be passed to unichr. */
304 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000305#endif
306}
307
#ifdef Py_DEBUG
/* Debug-only sanity check of a Unicode object's internal state.

   Asserts that the state flags (kind, compact, ascii, ready), the data
   pointer and the cached utf8/wstr fields are mutually consistent.  When
   check_content is non-zero it additionally scans the characters to
   assert that the narrowest suitable kind is in use and, unless the
   object is a singleton, that the hash has not been computed yet.

   Always returns 1, so the call can be wrapped in assert().

   FIXME: use PyObject* type for op */
int
_PyUnicode_CheckConsistency(void *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii != 1 || ascii->state.compact != 1) {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact != 1) {
            /* Non-compact object: characters live behind data.any. */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert(compact->utf8 == data);
                    assert(compact->utf8_length == ascii->length);
                }
                else {
                    assert(compact->utf8 != data);
                }
            }
            else {
                /* Not ready yet: only the wstr buffer may be set. */
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
            }
        }
        else {
            /* Compact non-ASCII object: characters follow the header. */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert(compact->utf8 != data);
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must alias the data exactly when the kind's character
               width matches sizeof(wchar_t). */
#if SIZEOF_WCHAR_T == 2
            const int same_width = (kind == PyUnicode_2BYTE_KIND);
#else
            const int same_width = (kind == PyUnicode_4BYTE_KIND);
#endif
            if (same_width) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }
    else {
        /* Compact ASCII object. */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t pos;
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);

        for (pos = 0; pos < ascii->length; pos++) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
            }
            else {
                assert(maxchar < 128);
            }
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
        }
        else {
            assert(maxchar >= 0x10000);
        }
    }
    if (check_content && !unicode_is_singleton((PyObject*)ascii)) {
        assert(ascii->hash == -1);
    }
    return 1;
}
#endif
413
Thomas Wouters477c8d52006-05-27 19:21:47 +0000414/* --- Bloom Filters ----------------------------------------------------- */
415
416/* stuff to implement simple "bloom filters" for Unicode characters.
417 to keep things simple, we use a single bitmask, using the least 5
418 bits from each unicode characters as the bit index. */
419
420/* the linebreak mask is set up by Unicode_Init below */
421
Antoine Pitrouf068f942010-01-13 14:19:12 +0000422#if LONG_BIT >= 128
423#define BLOOM_WIDTH 128
424#elif LONG_BIT >= 64
425#define BLOOM_WIDTH 64
426#elif LONG_BIT >= 32
427#define BLOOM_WIDTH 32
428#else
429#error "LONG_BIT is smaller than 32"
430#endif
431
Thomas Wouters477c8d52006-05-27 19:21:47 +0000432#define BLOOM_MASK unsigned long
433
434static BLOOM_MASK bloom_linebreak;
435
Antoine Pitrouf068f942010-01-13 14:19:12 +0000436#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
437#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000438
Benjamin Peterson29060642009-01-31 22:14:21 +0000439#define BLOOM_LINEBREAK(ch) \
440 ((ch) < 128U ? ascii_linebreak[(ch)] : \
441 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000442
Alexander Belopolsky40018472011-02-26 01:02:56 +0000443Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200444make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000445{
446 /* calculate simple bloom-style bitmask for a given unicode string */
447
Antoine Pitrouf068f942010-01-13 14:19:12 +0000448 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000449 Py_ssize_t i;
450
451 mask = 0;
452 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200453 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000454
455 return mask;
456}
457
/* Membership test of chr in str: the bloom mask is checked first, and
   PyUnicode_FindChar() over the whole string is only called when the
   mask reports a possible match. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr) &&                                                \
     (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000461
Guido van Rossumd57fd912000-03-10 22:53:23 +0000462/* --- Unicode Object ----------------------------------------------------- */
463
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200464static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200465fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200466
467Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
468 Py_ssize_t size, Py_UCS4 ch,
469 int direction)
470{
471 /* like wcschr, but doesn't stop at NULL characters */
472 Py_ssize_t i;
473 if (direction == 1) {
474 for(i = 0; i < size; i++)
475 if (PyUnicode_READ(kind, s, i) == ch)
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200476 return (char*)s + kind * i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200477 }
478 else {
479 for(i = size-1; i >= 0; i--)
480 if (PyUnicode_READ(kind, s, i) == ch)
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200481 return (char*)s + kind * i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200482 }
483 return NULL;
484}
485
Victor Stinnerfe226c02011-10-03 03:52:20 +0200486static PyObject*
487resize_compact(PyObject *unicode, Py_ssize_t length)
488{
489 Py_ssize_t char_size;
490 Py_ssize_t struct_size;
491 Py_ssize_t new_size;
492 int share_wstr;
493
494 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200495 char_size = PyUnicode_KIND(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200496 if (PyUnicode_IS_COMPACT_ASCII(unicode))
497 struct_size = sizeof(PyASCIIObject);
498 else
499 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200500 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200501
502 _Py_DEC_REFTOTAL;
503 _Py_ForgetReference(unicode);
504
505 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
506 PyErr_NoMemory();
507 return NULL;
508 }
509 new_size = (struct_size + (length + 1) * char_size);
510
511 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
512 if (unicode == NULL) {
513 PyObject_Del(unicode);
514 PyErr_NoMemory();
515 return NULL;
516 }
517 _Py_NewReference(unicode);
518 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200519 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200520 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200521 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
522 _PyUnicode_WSTR_LENGTH(unicode) = length;
523 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200524 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
525 length, 0);
526 return unicode;
527}
528
Alexander Belopolsky40018472011-02-26 01:02:56 +0000529static int
Victor Stinner95663112011-10-04 01:03:50 +0200530resize_inplace(PyUnicodeObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000531{
Victor Stinner95663112011-10-04 01:03:50 +0200532 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200533 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200534 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000535
Victor Stinner95663112011-10-04 01:03:50 +0200536 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200537
538 if (PyUnicode_IS_READY(unicode)) {
539 Py_ssize_t char_size;
540 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200541 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200542 void *data;
543
544 data = _PyUnicode_DATA_ANY(unicode);
545 assert(data != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200546 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200547 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
548 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200549 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
550 {
551 PyObject_DEL(_PyUnicode_UTF8(unicode));
552 _PyUnicode_UTF8(unicode) = NULL;
553 _PyUnicode_UTF8_LENGTH(unicode) = 0;
554 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200555
556 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
557 PyErr_NoMemory();
558 return -1;
559 }
560 new_size = (length + 1) * char_size;
561
562 data = (PyObject *)PyObject_REALLOC(data, new_size);
563 if (data == NULL) {
564 PyErr_NoMemory();
565 return -1;
566 }
567 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200568 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200569 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200570 _PyUnicode_WSTR_LENGTH(unicode) = length;
571 }
572 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200573 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200574 _PyUnicode_UTF8_LENGTH(unicode) = length;
575 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200576 _PyUnicode_LENGTH(unicode) = length;
577 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200578 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200579 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200580 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200581 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200582 }
Victor Stinner95663112011-10-04 01:03:50 +0200583 assert(_PyUnicode_WSTR(unicode) != NULL);
584
585 /* check for integer overflow */
586 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
587 PyErr_NoMemory();
588 return -1;
589 }
590 wstr = _PyUnicode_WSTR(unicode);
591 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
592 if (!wstr) {
593 PyErr_NoMemory();
594 return -1;
595 }
596 _PyUnicode_WSTR(unicode) = wstr;
597 _PyUnicode_WSTR(unicode)[length] = 0;
598 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200599 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000600 return 0;
601}
602
Victor Stinnerfe226c02011-10-03 03:52:20 +0200603static PyObject*
604resize_copy(PyObject *unicode, Py_ssize_t length)
605{
606 Py_ssize_t copy_length;
607 if (PyUnicode_IS_COMPACT(unicode)) {
608 PyObject *copy;
609 assert(PyUnicode_IS_READY(unicode));
610
611 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
612 if (copy == NULL)
613 return NULL;
614
615 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200616 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200617 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200618 }
619 else {
Victor Stinner2fd82272011-10-03 04:06:05 +0200620 PyUnicodeObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200621 assert(_PyUnicode_WSTR(unicode) != NULL);
622 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner2fd82272011-10-03 04:06:05 +0200623 w = _PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200624 if (w == NULL)
625 return NULL;
626 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
627 copy_length = Py_MIN(copy_length, length);
628 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
629 copy_length);
630 return (PyObject*)w;
631 }
632}
633
Guido van Rossumd57fd912000-03-10 22:53:23 +0000634/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000635 Ux0000 terminated; some code (e.g. new_identifier)
636 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000637
638 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000639 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000640
641*/
642
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200643#ifdef Py_DEBUG
644int unicode_old_new_calls = 0;
645#endif
646
Alexander Belopolsky40018472011-02-26 01:02:56 +0000647static PyUnicodeObject *
648_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000649{
650 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200651 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000652
Thomas Wouters477c8d52006-05-27 19:21:47 +0000653 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000654 if (length == 0 && unicode_empty != NULL) {
655 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200656 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000657 }
658
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000659 /* Ensure we won't overflow the size. */
660 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
661 return (PyUnicodeObject *)PyErr_NoMemory();
662 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200663 if (length < 0) {
664 PyErr_SetString(PyExc_SystemError,
665 "Negative size passed to _PyUnicode_New");
666 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000667 }
668
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200669#ifdef Py_DEBUG
670 ++unicode_old_new_calls;
671#endif
672
673 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
674 if (unicode == NULL)
675 return NULL;
676 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
677 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
678 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000679 PyErr_NoMemory();
680 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000681 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200682
Jeremy Hyltond8082792003-09-16 19:41:39 +0000683 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000684 * the caller fails before initializing str -- unicode_resize()
685 * reads str[0], and the Keep-Alive optimization can keep memory
686 * allocated for str alive across a call to unicode_dealloc(unicode).
687 * We don't want unicode_resize to read uninitialized memory in
688 * that case.
689 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200690 _PyUnicode_WSTR(unicode)[0] = 0;
691 _PyUnicode_WSTR(unicode)[length] = 0;
692 _PyUnicode_WSTR_LENGTH(unicode) = length;
693 _PyUnicode_HASH(unicode) = -1;
694 _PyUnicode_STATE(unicode).interned = 0;
695 _PyUnicode_STATE(unicode).kind = 0;
696 _PyUnicode_STATE(unicode).compact = 0;
697 _PyUnicode_STATE(unicode).ready = 0;
698 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200699 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200700 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200701 _PyUnicode_UTF8(unicode) = NULL;
702 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000703 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000704
Benjamin Peterson29060642009-01-31 22:14:21 +0000705 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000706 /* XXX UNREF/NEWREF interface should be more symmetrical */
707 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000708 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000709 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000710 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000711}
712
Victor Stinnerf42dc442011-10-02 23:33:16 +0200713static const char*
714unicode_kind_name(PyObject *unicode)
715{
Victor Stinner42dfd712011-10-03 14:41:45 +0200716 /* don't check consistency: unicode_kind_name() is called from
717 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200718 if (!PyUnicode_IS_COMPACT(unicode))
719 {
720 if (!PyUnicode_IS_READY(unicode))
721 return "wstr";
722 switch(PyUnicode_KIND(unicode))
723 {
724 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200725 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200726 return "legacy ascii";
727 else
728 return "legacy latin1";
729 case PyUnicode_2BYTE_KIND:
730 return "legacy UCS2";
731 case PyUnicode_4BYTE_KIND:
732 return "legacy UCS4";
733 default:
734 return "<legacy invalid kind>";
735 }
736 }
737 assert(PyUnicode_IS_READY(unicode));
738 switch(PyUnicode_KIND(unicode))
739 {
740 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200741 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200742 return "ascii";
743 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200744 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200745 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200746 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200747 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200748 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200749 default:
750 return "<invalid compact kind>";
751 }
752}
753
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200754#ifdef Py_DEBUG
755int unicode_new_new_calls = 0;
756
757/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
761
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
765void *_PyUnicode_data(void *unicode){
766 printf("obj %p\n", unicode);
767 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
768 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
769 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
770 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
771 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
772 return PyUnicode_DATA(unicode);
773}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200774
775void
776_PyUnicode_Dump(PyObject *op)
777{
778 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200779 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
780 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
781 void *data;
782 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
783 if (ascii->state.compact)
784 data = (compact + 1);
785 else
786 data = unicode->data.any;
787 if (ascii->wstr == data)
788 printf("shared ");
789 printf("wstr=%p", ascii->wstr);
Victor Stinnera3b334d2011-10-03 13:53:37 +0200790 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200791 printf(" (%zu), ", compact->wstr_length);
792 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
793 printf("shared ");
794 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200795 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200796 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200797}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200798#endif
799
800PyObject *
801PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
802{
803 PyObject *obj;
804 PyCompactUnicodeObject *unicode;
805 void *data;
806 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200807 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200808 Py_ssize_t char_size;
809 Py_ssize_t struct_size;
810
811 /* Optimization for empty strings */
812 if (size == 0 && unicode_empty != NULL) {
813 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200814 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200815 }
816
817#ifdef Py_DEBUG
818 ++unicode_new_new_calls;
819#endif
820
Victor Stinner9e9d6892011-10-04 01:02:02 +0200821 is_ascii = 0;
822 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200823 struct_size = sizeof(PyCompactUnicodeObject);
824 if (maxchar < 128) {
825 kind_state = PyUnicode_1BYTE_KIND;
826 char_size = 1;
827 is_ascii = 1;
828 struct_size = sizeof(PyASCIIObject);
829 }
830 else if (maxchar < 256) {
831 kind_state = PyUnicode_1BYTE_KIND;
832 char_size = 1;
833 }
834 else if (maxchar < 65536) {
835 kind_state = PyUnicode_2BYTE_KIND;
836 char_size = 2;
837 if (sizeof(wchar_t) == 2)
838 is_sharing = 1;
839 }
840 else {
841 kind_state = PyUnicode_4BYTE_KIND;
842 char_size = 4;
843 if (sizeof(wchar_t) == 4)
844 is_sharing = 1;
845 }
846
847 /* Ensure we won't overflow the size. */
848 if (size < 0) {
849 PyErr_SetString(PyExc_SystemError,
850 "Negative size passed to PyUnicode_New");
851 return NULL;
852 }
853 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
854 return PyErr_NoMemory();
855
856 /* Duplicated allocation code from _PyObject_New() instead of a call to
857 * PyObject_New() so we are able to allocate space for the object and
858 * it's data buffer.
859 */
860 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
861 if (obj == NULL)
862 return PyErr_NoMemory();
863 obj = PyObject_INIT(obj, &PyUnicode_Type);
864 if (obj == NULL)
865 return NULL;
866
867 unicode = (PyCompactUnicodeObject *)obj;
868 if (is_ascii)
869 data = ((PyASCIIObject*)obj) + 1;
870 else
871 data = unicode + 1;
872 _PyUnicode_LENGTH(unicode) = size;
873 _PyUnicode_HASH(unicode) = -1;
874 _PyUnicode_STATE(unicode).interned = 0;
875 _PyUnicode_STATE(unicode).kind = kind_state;
876 _PyUnicode_STATE(unicode).compact = 1;
877 _PyUnicode_STATE(unicode).ready = 1;
878 _PyUnicode_STATE(unicode).ascii = is_ascii;
879 if (is_ascii) {
880 ((char*)data)[size] = 0;
881 _PyUnicode_WSTR(unicode) = NULL;
882 }
883 else if (kind_state == PyUnicode_1BYTE_KIND) {
884 ((char*)data)[size] = 0;
885 _PyUnicode_WSTR(unicode) = NULL;
886 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200887 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200888 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200889 }
890 else {
891 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200892 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200893 if (kind_state == PyUnicode_2BYTE_KIND)
894 ((Py_UCS2*)data)[size] = 0;
895 else /* kind_state == PyUnicode_4BYTE_KIND */
896 ((Py_UCS4*)data)[size] = 0;
897 if (is_sharing) {
898 _PyUnicode_WSTR_LENGTH(unicode) = size;
899 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
900 }
901 else {
902 _PyUnicode_WSTR_LENGTH(unicode) = 0;
903 _PyUnicode_WSTR(unicode) = NULL;
904 }
905 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200906 assert(_PyUnicode_CheckConsistency(unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200907 return obj;
908}
909
#if SIZEOF_WCHAR_T == 2
/* Expand the 16-bit wchar_t range [begin, end) into the 4-byte data buffer
   of `unicode`, joining each high/low surrogate pair into one code point.
   Lone surrogates are copied through unchanged.  The other width
   conversions are implemented as macros for efficiency.

   `unicode` must already be a 4-byte-kind object whose length equals the
   number of code points produced; the buffer is assumed to additionally have
   room for a terminating null character, which is not written here. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (src[0] < 0xD800 || src[0] > 0xDBFF
            || (src+1) >= end || src[1] < 0xDC00 || src[1] > 0xDFFF)
        {
            /* not the start of a valid surrogate pair: copy one unit */
            *dst++ = *src;
            src++;
            continue;
        }
        *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
        src += 2;
    }
    /* every slot of the destination must have been filled */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
948
Victor Stinnercd9950f2011-10-02 00:34:53 +0200949static int
950_PyUnicode_Dirty(PyObject *unicode)
951{
Victor Stinner910337b2011-10-03 03:20:16 +0200952 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +0200953 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +0200954 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +0200955 "Cannot modify a string having more than 1 reference");
956 return -1;
957 }
958 _PyUnicode_DIRTY(unicode);
959 return 0;
960}
961
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200962static int
963_copy_characters(PyObject *to, Py_ssize_t to_start,
964 PyObject *from, Py_ssize_t from_start,
965 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200966{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200967 unsigned int from_kind, to_kind;
968 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200969 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200970
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200971 assert(PyUnicode_Check(from));
972 assert(PyUnicode_Check(to));
973 assert(PyUnicode_IS_READY(from));
974 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200975
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200976 assert(PyUnicode_GET_LENGTH(from) >= how_many);
977 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
978 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200979
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200980 if (how_many == 0)
981 return 0;
982
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200983 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200984 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200985 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200986 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200987
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200988#ifdef Py_DEBUG
989 if (!check_maxchar
990 && (from_kind > to_kind
991 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200992 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200993 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
994 Py_UCS4 ch;
995 Py_ssize_t i;
996 for (i=0; i < how_many; i++) {
997 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
998 assert(ch <= to_maxchar);
999 }
1000 }
1001#endif
1002 fast = (from_kind == to_kind);
1003 if (check_maxchar
1004 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1005 {
1006 /* deny latin1 => ascii */
1007 fast = 0;
1008 }
1009
1010 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001011 Py_MEMCPY((char*)to_data + to_kind * to_start,
1012 (char*)from_data + from_kind * from_start,
1013 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001014 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001015 else if (from_kind == PyUnicode_1BYTE_KIND
1016 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001017 {
1018 _PyUnicode_CONVERT_BYTES(
1019 Py_UCS1, Py_UCS2,
1020 PyUnicode_1BYTE_DATA(from) + from_start,
1021 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1022 PyUnicode_2BYTE_DATA(to) + to_start
1023 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001024 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001025 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001026 && to_kind == PyUnicode_4BYTE_KIND)
1027 {
1028 _PyUnicode_CONVERT_BYTES(
1029 Py_UCS1, Py_UCS4,
1030 PyUnicode_1BYTE_DATA(from) + from_start,
1031 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1032 PyUnicode_4BYTE_DATA(to) + to_start
1033 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001034 }
1035 else if (from_kind == PyUnicode_2BYTE_KIND
1036 && to_kind == PyUnicode_4BYTE_KIND)
1037 {
1038 _PyUnicode_CONVERT_BYTES(
1039 Py_UCS2, Py_UCS4,
1040 PyUnicode_2BYTE_DATA(from) + from_start,
1041 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1042 PyUnicode_4BYTE_DATA(to) + to_start
1043 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001044 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001045 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001046 /* check if max_char(from substring) <= max_char(to) */
1047 if (from_kind > to_kind
1048 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001049 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001050 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001051 /* slow path to check for character overflow */
1052 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001053 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001054 Py_ssize_t i;
1055
Victor Stinner56c161a2011-10-06 02:47:11 +02001056#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001057 for (i=0; i < how_many; i++) {
1058 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001059 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001060 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1061 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001062#else
1063 if (!check_maxchar) {
1064 for (i=0; i < how_many; i++) {
1065 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1066 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1067 }
1068 }
1069 else {
1070 for (i=0; i < how_many; i++) {
1071 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1072 if (ch > to_maxchar)
1073 return 1;
1074 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1075 }
1076 }
1077#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001078 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001079 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001080 assert(0 && "inconsistent state");
1081 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001082 }
1083 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001084 return 0;
1085}
1086
1087static void
1088copy_characters(PyObject *to, Py_ssize_t to_start,
1089 PyObject *from, Py_ssize_t from_start,
1090 Py_ssize_t how_many)
1091{
1092 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1093}
1094
1095Py_ssize_t
1096PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1097 PyObject *from, Py_ssize_t from_start,
1098 Py_ssize_t how_many)
1099{
1100 int err;
1101
1102 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1103 PyErr_BadInternalCall();
1104 return -1;
1105 }
1106
1107 if (PyUnicode_READY(from))
1108 return -1;
1109 if (PyUnicode_READY(to))
1110 return -1;
1111
1112 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1113 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1114 PyErr_Format(PyExc_SystemError,
1115 "Cannot write %zi characters at %zi "
1116 "in a string of %zi characters",
1117 how_many, to_start, PyUnicode_GET_LENGTH(to));
1118 return -1;
1119 }
1120
1121 if (how_many == 0)
1122 return 0;
1123
1124 if (_PyUnicode_Dirty(to))
1125 return -1;
1126
1127 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1128 if (err) {
1129 PyErr_Format(PyExc_SystemError,
1130 "Cannot copy %s characters "
1131 "into a string of %s characters",
1132 unicode_kind_name(from),
1133 unicode_kind_name(to));
1134 return -1;
1135 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001136 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001137}
1138
Victor Stinner17222162011-09-28 22:15:37 +02001139/* Find the maximum code point and count the number of surrogate pairs so a
1140 correct string length can be computed before converting a string to UCS4.
1141 This function counts single surrogates as a character and not as a pair.
1142
1143 Return 0 on success, or -1 on error. */
1144static int
1145find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1146 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001147{
1148 const wchar_t *iter;
1149
Victor Stinnerc53be962011-10-02 21:33:54 +02001150 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001151 *num_surrogates = 0;
1152 *maxchar = 0;
1153
1154 for (iter = begin; iter < end; ) {
Victor Stinnerae864852011-10-05 14:02:44 +02001155 if (*iter > *maxchar) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001156 *maxchar = *iter;
Victor Stinnerae864852011-10-05 14:02:44 +02001157#if SIZEOF_WCHAR_T != 2
1158 if (*maxchar >= 0x10000)
1159 return 0;
1160#endif
1161 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001162#if SIZEOF_WCHAR_T == 2
1163 if (*iter >= 0xD800 && *iter <= 0xDBFF
1164 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1165 {
1166 Py_UCS4 surrogate_val;
1167 surrogate_val = (((iter[0] & 0x3FF)<<10)
1168 | (iter[1] & 0x3FF)) + 0x10000;
1169 ++(*num_surrogates);
1170 if (surrogate_val > *maxchar)
1171 *maxchar = surrogate_val;
1172 iter += 2;
1173 }
1174 else
1175 iter++;
1176#else
1177 iter++;
1178#endif
1179 }
1180 return 0;
1181}
1182
1183#ifdef Py_DEBUG
1184int unicode_ready_calls = 0;
1185#endif
1186
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001187static int
1188unicode_ready(PyObject **p_obj, int replace)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001189{
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001190 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001191 wchar_t *end;
1192 Py_UCS4 maxchar = 0;
1193 Py_ssize_t num_surrogates;
1194#if SIZEOF_WCHAR_T == 2
1195 Py_ssize_t length_wo_surrogates;
1196#endif
1197
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001198 assert(p_obj != NULL);
1199 unicode = (PyUnicodeObject *)*p_obj;
1200
Georg Brandl7597add2011-10-05 16:36:47 +02001201 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001202 strings were created using _PyObject_New() and where no canonical
1203 representation (the str field) has been set yet aka strings
1204 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001205 assert(_PyUnicode_CHECK(unicode));
1206 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001207 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001208 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001209 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001210 /* Actually, it should neither be interned nor be anything else: */
1211 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001212
1213#ifdef Py_DEBUG
1214 ++unicode_ready_calls;
1215#endif
1216
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001217#ifdef Py_DEBUG
1218 assert(!replace || Py_REFCNT(unicode) == 1);
1219#else
1220 if (replace && Py_REFCNT(unicode) != 1)
1221 replace = 0;
1222#endif
1223 if (replace) {
1224 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
1225 wchar_t *wstr = _PyUnicode_WSTR(unicode);
1226 /* Optimization for empty strings */
1227 if (len == 0) {
1228 Py_INCREF(unicode_empty);
1229 Py_DECREF(*p_obj);
1230 *p_obj = unicode_empty;
1231 return 0;
1232 }
1233 if (len == 1 && wstr[0] < 256) {
1234 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
1235 if (latin1_char == NULL)
1236 return -1;
1237 Py_DECREF(*p_obj);
1238 *p_obj = latin1_char;
1239 return 0;
1240 }
1241 }
1242
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001243 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001244 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001245 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001246 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001247
1248 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001249 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1250 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001251 PyErr_NoMemory();
1252 return -1;
1253 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001254 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001255 _PyUnicode_WSTR(unicode), end,
1256 PyUnicode_1BYTE_DATA(unicode));
1257 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1258 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1259 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1260 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001261 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001262 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001263 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001264 }
1265 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001266 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001267 _PyUnicode_UTF8(unicode) = NULL;
1268 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001269 }
1270 PyObject_FREE(_PyUnicode_WSTR(unicode));
1271 _PyUnicode_WSTR(unicode) = NULL;
1272 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1273 }
1274 /* In this case we might have to convert down from 4-byte native
1275 wchar_t to 2-byte unicode. */
1276 else if (maxchar < 65536) {
1277 assert(num_surrogates == 0 &&
1278 "FindMaxCharAndNumSurrogatePairs() messed up");
1279
Victor Stinner506f5922011-09-28 22:34:18 +02001280#if SIZEOF_WCHAR_T == 2
1281 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001282 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001283 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1284 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1285 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001286 _PyUnicode_UTF8(unicode) = NULL;
1287 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001288#else
1289 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001290 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001291 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001292 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001293 PyErr_NoMemory();
1294 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001295 }
Victor Stinner506f5922011-09-28 22:34:18 +02001296 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1297 _PyUnicode_WSTR(unicode), end,
1298 PyUnicode_2BYTE_DATA(unicode));
1299 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1300 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1301 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001302 _PyUnicode_UTF8(unicode) = NULL;
1303 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001304 PyObject_FREE(_PyUnicode_WSTR(unicode));
1305 _PyUnicode_WSTR(unicode) = NULL;
1306 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1307#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001308 }
1309 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1310 else {
1311#if SIZEOF_WCHAR_T == 2
1312 /* in case the native representation is 2-bytes, we need to allocate a
1313 new normalized 4-byte version. */
1314 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001315 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1316 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001317 PyErr_NoMemory();
1318 return -1;
1319 }
1320 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1321 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001322 _PyUnicode_UTF8(unicode) = NULL;
1323 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001324 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1325 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001326 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001327 PyObject_FREE(_PyUnicode_WSTR(unicode));
1328 _PyUnicode_WSTR(unicode) = NULL;
1329 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1330#else
1331 assert(num_surrogates == 0);
1332
Victor Stinnerc3c74152011-10-02 20:39:55 +02001333 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001334 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001335 _PyUnicode_UTF8(unicode) = NULL;
1336 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001337 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1338#endif
1339 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1340 }
1341 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001342 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001343 return 0;
1344}
1345
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001346int
1347_PyUnicode_ReadyReplace(PyObject **op)
1348{
1349 return unicode_ready(op, 1);
1350}
1351
1352int
1353_PyUnicode_Ready(PyObject *op)
1354{
1355 return unicode_ready(&op, 0);
1356}
1357
Alexander Belopolsky40018472011-02-26 01:02:56 +00001358static void
1359unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001360{
Walter Dörwald16807132007-05-25 13:52:07 +00001361 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001362 case SSTATE_NOT_INTERNED:
1363 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001364
Benjamin Peterson29060642009-01-31 22:14:21 +00001365 case SSTATE_INTERNED_MORTAL:
1366 /* revive dead object temporarily for DelItem */
1367 Py_REFCNT(unicode) = 3;
1368 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1369 Py_FatalError(
1370 "deletion of interned string failed");
1371 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001372
Benjamin Peterson29060642009-01-31 22:14:21 +00001373 case SSTATE_INTERNED_IMMORTAL:
1374 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001375
Benjamin Peterson29060642009-01-31 22:14:21 +00001376 default:
1377 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001378 }
1379
Victor Stinner03490912011-10-03 23:45:12 +02001380 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001381 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001382 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001383 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001384
1385 if (PyUnicode_IS_COMPACT(unicode)) {
1386 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001387 }
1388 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001389 if (_PyUnicode_DATA_ANY(unicode))
1390 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001391 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001392 }
1393}
1394
#ifdef Py_DEBUG
/* Return 1 if `unicode` is one of the shared singletons (the empty string
   or an entry of the unicode_latin1 table), 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;
    /* only a one-character string with canonical data can be a cached
       Latin-1 character */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;
    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode);
}
#endif
1411
Alexander Belopolsky40018472011-02-26 01:02:56 +00001412static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001413unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001414{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001415 if (Py_REFCNT(unicode) != 1)
1416 return 0;
1417 if (PyUnicode_CHECK_INTERNED(unicode))
1418 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001419#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001420 /* singleton refcount is greater than 1 */
1421 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001422#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001423 return 1;
1424}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001425
Victor Stinnerfe226c02011-10-03 03:52:20 +02001426static int
1427unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1428{
1429 PyObject *unicode;
1430 Py_ssize_t old_length;
1431
1432 assert(p_unicode != NULL);
1433 unicode = *p_unicode;
1434
1435 assert(unicode != NULL);
1436 assert(PyUnicode_Check(unicode));
1437 assert(0 <= length);
1438
Victor Stinner910337b2011-10-03 03:20:16 +02001439 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001440 old_length = PyUnicode_WSTR_LENGTH(unicode);
1441 else
1442 old_length = PyUnicode_GET_LENGTH(unicode);
1443 if (old_length == length)
1444 return 0;
1445
Victor Stinnerfe226c02011-10-03 03:52:20 +02001446 if (!unicode_resizable(unicode)) {
1447 PyObject *copy = resize_copy(unicode, length);
1448 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001449 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001450 Py_DECREF(*p_unicode);
1451 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001452 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001453 }
1454
Victor Stinnerfe226c02011-10-03 03:52:20 +02001455 if (PyUnicode_IS_COMPACT(unicode)) {
1456 *p_unicode = resize_compact(unicode, length);
1457 if (*p_unicode == NULL)
1458 return -1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001459 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001460 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001461 }
1462 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001463}
1464
Alexander Belopolsky40018472011-02-26 01:02:56 +00001465int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001466PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001467{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001468 PyObject *unicode;
1469 if (p_unicode == NULL) {
1470 PyErr_BadInternalCall();
1471 return -1;
1472 }
1473 unicode = *p_unicode;
1474 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1475 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1476 {
1477 PyErr_BadInternalCall();
1478 return -1;
1479 }
1480 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001481}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001482
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001483static PyObject*
1484get_latin1_char(unsigned char ch)
1485{
Victor Stinnera464fc12011-10-02 20:39:30 +02001486 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001487 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001488 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001489 if (!unicode)
1490 return NULL;
1491 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001492 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001493 unicode_latin1[ch] = unicode;
1494 }
1495 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001496 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001497}
1498
Alexander Belopolsky40018472011-02-26 01:02:56 +00001499PyObject *
1500PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001501{
1502 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001503 Py_UCS4 maxchar = 0;
1504 Py_ssize_t num_surrogates;
1505
1506 if (u == NULL)
1507 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001508
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001509 /* If the Unicode data is known at construction time, we can apply
1510 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001511
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001512 /* Optimization for empty strings */
1513 if (size == 0 && unicode_empty != NULL) {
1514 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001515 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001516 }
Tim Petersced69f82003-09-16 20:30:58 +00001517
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001518 /* Single character Unicode objects in the Latin-1 range are
1519 shared when using this constructor */
1520 if (size == 1 && *u < 256)
1521 return get_latin1_char((unsigned char)*u);
1522
1523 /* If not empty and not single character, copy the Unicode data
1524 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001525 if (find_maxchar_surrogates(u, u + size,
1526 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001527 return NULL;
1528
1529 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1530 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001531 if (!unicode)
1532 return NULL;
1533
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001534 switch (PyUnicode_KIND(unicode)) {
1535 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001536 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001537 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1538 break;
1539 case PyUnicode_2BYTE_KIND:
1540#if Py_UNICODE_SIZE == 2
1541 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1542#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001543 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001544 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1545#endif
1546 break;
1547 case PyUnicode_4BYTE_KIND:
1548#if SIZEOF_WCHAR_T == 2
1549 /* This is the only case which has to process surrogates, thus
1550 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001551 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001552#else
1553 assert(num_surrogates == 0);
1554 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1555#endif
1556 break;
1557 default:
1558 assert(0 && "Impossible state");
1559 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001560
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001561 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001562 return (PyObject *)unicode;
1563}
1564
Alexander Belopolsky40018472011-02-26 01:02:56 +00001565PyObject *
1566PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001567{
1568 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001569
Benjamin Peterson14339b62009-01-31 16:36:08 +00001570 if (size < 0) {
1571 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001572 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001573 return NULL;
1574 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001575
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001576 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001577 some optimizations which share commonly used objects.
1578 Also, this means the input must be UTF-8, so fall back to the
1579 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001580 if (u != NULL) {
1581
Benjamin Peterson29060642009-01-31 22:14:21 +00001582 /* Optimization for empty strings */
1583 if (size == 0 && unicode_empty != NULL) {
1584 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001585 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001586 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001587
1588 /* Single characters are shared when using this constructor.
1589 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001590 if (size == 1 && Py_CHARMASK(*u) < 128)
1591 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001592
1593 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001594 }
1595
Walter Dörwald55507312007-05-18 13:12:10 +00001596 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001597 if (!unicode)
1598 return NULL;
1599
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001600 return (PyObject *)unicode;
1601}
1602
Alexander Belopolsky40018472011-02-26 01:02:56 +00001603PyObject *
1604PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001605{
1606 size_t size = strlen(u);
1607 if (size > PY_SSIZE_T_MAX) {
1608 PyErr_SetString(PyExc_OverflowError, "input too long");
1609 return NULL;
1610 }
1611
1612 return PyUnicode_FromStringAndSize(u, size);
1613}
1614
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001615PyObject *
1616_PyUnicode_FromId(_Py_Identifier *id)
1617{
1618 if (!id->object) {
1619 id->object = PyUnicode_FromString(id->string);
1620 if (!id->object)
1621 return NULL;
1622 PyUnicode_InternInPlace(&id->object);
1623 assert(!id->next);
1624 id->next = static_strings;
1625 static_strings = id;
1626 }
1627 Py_INCREF(id->object);
1628 return id->object;
1629}
1630
1631void
1632_PyUnicode_ClearStaticStrings()
1633{
1634 _Py_Identifier *i;
1635 for (i = static_strings; i; i = i->next) {
1636 Py_DECREF(i->object);
1637 i->object = NULL;
1638 i->next = NULL;
1639 }
1640}
1641
Victor Stinnere57b1c02011-09-28 22:20:48 +02001642static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001643unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001644{
Victor Stinner0617b6e2011-10-05 23:26:01 +02001645 PyObject *res;
1646#ifdef Py_DEBUG
1647 const unsigned char *p;
1648 const unsigned char *end = s + size;
1649 for (p=s; p < end; p++) {
1650 assert(*p < 128);
1651 }
1652#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001653 if (size == 1)
1654 return get_latin1_char(s[0]);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001655 res = PyUnicode_New(size, 127);
Victor Stinner702c7342011-10-05 13:50:52 +02001656 if (!res)
1657 return NULL;
Victor Stinner0617b6e2011-10-05 23:26:01 +02001658 memcpy(PyUnicode_1BYTE_DATA(res), s, size);
Victor Stinner702c7342011-10-05 13:50:52 +02001659 return res;
1660}
1661
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001662static Py_UCS4
1663kind_maxchar_limit(unsigned int kind)
1664{
1665 switch(kind) {
1666 case PyUnicode_1BYTE_KIND:
1667 return 0x80;
1668 case PyUnicode_2BYTE_KIND:
1669 return 0x100;
1670 case PyUnicode_4BYTE_KIND:
1671 return 0x10000;
1672 default:
1673 assert(0 && "invalid kind");
1674 return 0x10ffff;
1675 }
1676}
1677
Victor Stinner702c7342011-10-05 13:50:52 +02001678static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001679_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001680{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001681 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001682 unsigned char max_char = 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001683 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001684
1685 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001686 if (size == 1)
1687 return get_latin1_char(u[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001688 for (i = 0; i < size; i++) {
1689 if (u[i] & 0x80) {
Victor Stinnerb9275c12011-10-05 14:01:42 +02001690 max_char = 255;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001691 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001692 }
1693 }
Victor Stinnerb9275c12011-10-05 14:01:42 +02001694 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001695 if (!res)
1696 return NULL;
1697 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001698 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001699 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001700}
1701
Victor Stinnere57b1c02011-09-28 22:20:48 +02001702static PyObject*
1703_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001704{
1705 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001706 Py_UCS2 max_char = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001707 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001708
1709 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001710 if (size == 1 && u[0] < 256)
1711 return get_latin1_char(u[0]);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001712 for (i = 0; i < size; i++) {
1713 if (u[i] > max_char) {
1714 max_char = u[i];
1715 if (max_char >= 256)
1716 break;
1717 }
1718 }
1719 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001720 if (!res)
1721 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001722 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001723 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1724 else
1725 for (i = 0; i < size; i++)
1726 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001727 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001728 return res;
1729}
1730
Victor Stinnere57b1c02011-09-28 22:20:48 +02001731static PyObject*
1732_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001733{
1734 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001735 Py_UCS4 max_char = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001736 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001737
1738 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001739 if (size == 1 && u[0] < 256)
1740 return get_latin1_char(u[0]);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001741 for (i = 0; i < size; i++) {
1742 if (u[i] > max_char) {
1743 max_char = u[i];
1744 if (max_char >= 0x10000)
1745 break;
1746 }
1747 }
1748 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001749 if (!res)
1750 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001751 if (max_char >= 0x10000)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001752 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1753 else {
1754 int kind = PyUnicode_KIND(res);
1755 void *data = PyUnicode_DATA(res);
1756 for (i = 0; i < size; i++)
1757 PyUnicode_WRITE(kind, data, i, u[i]);
1758 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001759 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001760 return res;
1761}
1762
1763PyObject*
1764PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1765{
1766 switch(kind) {
1767 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001768 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001769 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001770 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001771 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001772 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001773 default:
1774 assert(0 && "invalid kind");
1775 PyErr_SetString(PyExc_SystemError, "invalid kind");
1776 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001777 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001778}
1779
Victor Stinner25a4b292011-10-06 12:31:55 +02001780/* Ensure that a string uses the most efficient storage, if it is not the
1781 case: create a new string with of the right kind. Write NULL into *p_unicode
1782 on error. */
1783void
1784unicode_adjust_maxchar(PyObject **p_unicode)
1785{
1786 PyObject *unicode, *copy;
1787 Py_UCS4 max_char;
1788 Py_ssize_t i, len;
1789 unsigned int kind;
1790
1791 assert(p_unicode != NULL);
1792 unicode = *p_unicode;
1793 assert(PyUnicode_IS_READY(unicode));
1794 if (PyUnicode_IS_ASCII(unicode))
1795 return;
1796
1797 len = PyUnicode_GET_LENGTH(unicode);
1798 kind = PyUnicode_KIND(unicode);
1799 if (kind == PyUnicode_1BYTE_KIND) {
1800 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
1801 for (i = 0; i < len; i++) {
1802 if (u[i] & 0x80)
1803 return;
1804 }
1805 max_char = 127;
1806 }
1807 else if (kind == PyUnicode_2BYTE_KIND) {
1808 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
1809 max_char = 0;
1810 for (i = 0; i < len; i++) {
1811 if (u[i] > max_char) {
1812 max_char = u[i];
1813 if (max_char >= 256)
1814 return;
1815 }
1816 }
1817 }
1818 else {
Antoine Pitrou15a66cf2011-10-06 15:25:32 +02001819 const Py_UCS4 *u;
Victor Stinner25a4b292011-10-06 12:31:55 +02001820 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitrou15a66cf2011-10-06 15:25:32 +02001821 u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001822 max_char = 0;
1823 for (i = 0; i < len; i++) {
1824 if (u[i] > max_char) {
1825 max_char = u[i];
1826 if (max_char >= 0x10000)
1827 return;
1828 }
1829 }
1830 }
Victor Stinner200f2132011-10-06 13:27:56 +02001831 assert(max_char < PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinner25a4b292011-10-06 12:31:55 +02001832 copy = PyUnicode_New(len, max_char);
1833 copy_characters(copy, 0, unicode, 0, len);
1834 Py_DECREF(unicode);
1835 *p_unicode = copy;
1836}
1837
Victor Stinner034f6cf2011-09-30 02:26:44 +02001838PyObject*
1839PyUnicode_Copy(PyObject *unicode)
1840{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001841 Py_ssize_t size;
1842 PyObject *copy;
1843 void *data;
1844
Victor Stinner034f6cf2011-09-30 02:26:44 +02001845 if (!PyUnicode_Check(unicode)) {
1846 PyErr_BadInternalCall();
1847 return NULL;
1848 }
1849 if (PyUnicode_READY(unicode))
1850 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001851
1852 size = PyUnicode_GET_LENGTH(unicode);
1853 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1854 if (!copy)
1855 return NULL;
1856 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1857
1858 data = PyUnicode_DATA(unicode);
1859 switch (PyUnicode_KIND(unicode))
1860 {
1861 case PyUnicode_1BYTE_KIND:
1862 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1863 break;
1864 case PyUnicode_2BYTE_KIND:
1865 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1866 break;
1867 case PyUnicode_4BYTE_KIND:
1868 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1869 break;
1870 default:
1871 assert(0);
1872 break;
1873 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001874 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001875 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001876}
1877
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001878
Victor Stinnerbc603d12011-10-02 01:00:40 +02001879/* Widen Unicode objects to larger buffers. Don't write terminating null
1880 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001881
1882void*
1883_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1884{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001885 Py_ssize_t len;
1886 void *result;
1887 unsigned int skind;
1888
1889 if (PyUnicode_READY(s))
1890 return NULL;
1891
1892 len = PyUnicode_GET_LENGTH(s);
1893 skind = PyUnicode_KIND(s);
1894 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001895 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001896 return NULL;
1897 }
1898 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001899 case PyUnicode_2BYTE_KIND:
1900 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1901 if (!result)
1902 return PyErr_NoMemory();
1903 assert(skind == PyUnicode_1BYTE_KIND);
1904 _PyUnicode_CONVERT_BYTES(
1905 Py_UCS1, Py_UCS2,
1906 PyUnicode_1BYTE_DATA(s),
1907 PyUnicode_1BYTE_DATA(s) + len,
1908 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001909 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001910 case PyUnicode_4BYTE_KIND:
1911 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1912 if (!result)
1913 return PyErr_NoMemory();
1914 if (skind == PyUnicode_2BYTE_KIND) {
1915 _PyUnicode_CONVERT_BYTES(
1916 Py_UCS2, Py_UCS4,
1917 PyUnicode_2BYTE_DATA(s),
1918 PyUnicode_2BYTE_DATA(s) + len,
1919 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001920 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001921 else {
1922 assert(skind == PyUnicode_1BYTE_KIND);
1923 _PyUnicode_CONVERT_BYTES(
1924 Py_UCS1, Py_UCS4,
1925 PyUnicode_1BYTE_DATA(s),
1926 PyUnicode_1BYTE_DATA(s) + len,
1927 result);
1928 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001929 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001930 default:
1931 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001932 }
Victor Stinner01698042011-10-04 00:04:26 +02001933 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001934 return NULL;
1935}
1936
1937static Py_UCS4*
1938as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1939 int copy_null)
1940{
1941 int kind;
1942 void *data;
1943 Py_ssize_t len, targetlen;
1944 if (PyUnicode_READY(string) == -1)
1945 return NULL;
1946 kind = PyUnicode_KIND(string);
1947 data = PyUnicode_DATA(string);
1948 len = PyUnicode_GET_LENGTH(string);
1949 targetlen = len;
1950 if (copy_null)
1951 targetlen++;
1952 if (!target) {
1953 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1954 PyErr_NoMemory();
1955 return NULL;
1956 }
1957 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1958 if (!target) {
1959 PyErr_NoMemory();
1960 return NULL;
1961 }
1962 }
1963 else {
1964 if (targetsize < targetlen) {
1965 PyErr_Format(PyExc_SystemError,
1966 "string is longer than the buffer");
1967 if (copy_null && 0 < targetsize)
1968 target[0] = 0;
1969 return NULL;
1970 }
1971 }
1972 if (kind != PyUnicode_4BYTE_KIND) {
1973 Py_ssize_t i;
1974 for (i = 0; i < len; i++)
1975 target[i] = PyUnicode_READ(kind, data, i);
1976 }
1977 else
1978 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1979 if (copy_null)
1980 target[len] = 0;
1981 return target;
1982}
1983
1984Py_UCS4*
1985PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1986 int copy_null)
1987{
1988 if (target == NULL || targetsize < 1) {
1989 PyErr_BadInternalCall();
1990 return NULL;
1991 }
1992 return as_ucs4(string, target, targetsize, copy_null);
1993}
1994
1995Py_UCS4*
1996PyUnicode_AsUCS4Copy(PyObject *string)
1997{
1998 return as_ucs4(string, NULL, 0, 1);
1999}
2000
#ifdef HAVE_WCHAR_H

/* Create a string from the wchar_t buffer `w`.

   A size of -1 means "measure with wcslen()".  With w == NULL only size 0
   is accepted and produces PyUnicode_New(0, 0); any other size raises a bad
   internal call error and returns NULL. */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        Py_ssize_t length = (size == -1) ? (Py_ssize_t)wcslen(w) : size;
        return PyUnicode_FromUnicode(w, length);
    }

    if (size != 0) {
        PyErr_BadInternalCall();
        return NULL;
    }
    return PyUnicode_New(0, 0);
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002021
Walter Dörwald346737f2007-05-31 10:44:43 +00002022static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002023makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2024 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002025{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002026 *fmt++ = '%';
2027 if (width) {
2028 if (zeropad)
2029 *fmt++ = '0';
2030 fmt += sprintf(fmt, "%d", width);
2031 }
2032 if (precision)
2033 fmt += sprintf(fmt, ".%d", precision);
2034 if (longflag)
2035 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002036 else if (longlongflag) {
2037 /* longlongflag should only ever be nonzero on machines with
2038 HAVE_LONG_LONG defined */
2039#ifdef HAVE_LONG_LONG
2040 char *f = PY_FORMAT_LONG_LONG;
2041 while (*f)
2042 *fmt++ = *f++;
2043#else
2044 /* we shouldn't ever get here */
2045 assert(0);
2046 *fmt++ = 'l';
2047#endif
2048 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002049 else if (size_tflag) {
2050 char *f = PY_FORMAT_SIZE_T;
2051 while (*f)
2052 *fmt++ = *f++;
2053 }
2054 *fmt++ = c;
2055 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002056}
2057
Victor Stinner96865452011-03-01 23:44:09 +00002058/* helper for PyUnicode_FromFormatV() */
2059
2060static const char*
2061parse_format_flags(const char *f,
2062 int *p_width, int *p_precision,
2063 int *p_longflag, int *p_longlongflag, int *p_size_tflag)
2064{
2065 int width, precision, longflag, longlongflag, size_tflag;
2066
2067 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2068 f++;
2069 width = 0;
2070 while (Py_ISDIGIT((unsigned)*f))
2071 width = (width*10) + *f++ - '0';
2072 precision = 0;
2073 if (*f == '.') {
2074 f++;
2075 while (Py_ISDIGIT((unsigned)*f))
2076 precision = (precision*10) + *f++ - '0';
2077 if (*f == '%') {
2078 /* "%.3%s" => f points to "3" */
2079 f--;
2080 }
2081 }
2082 if (*f == '\0') {
2083 /* bogus format "%.1" => go backward, f points to "1" */
2084 f--;
2085 }
2086 if (p_width != NULL)
2087 *p_width = width;
2088 if (p_precision != NULL)
2089 *p_precision = precision;
2090
2091 /* Handle %ld, %lu, %lld and %llu. */
2092 longflag = 0;
2093 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002094 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002095
2096 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002097 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002098 longflag = 1;
2099 ++f;
2100 }
2101#ifdef HAVE_LONG_LONG
2102 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002103 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002104 longlongflag = 1;
2105 f += 2;
2106 }
2107#endif
2108 }
2109 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002110 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002111 size_tflag = 1;
2112 ++f;
2113 }
2114 if (p_longflag != NULL)
2115 *p_longflag = longflag;
2116 if (p_longlongflag != NULL)
2117 *p_longlongflag = longlongflag;
2118 if (p_size_tflag != NULL)
2119 *p_size_tflag = size_tflag;
2120 return f;
2121}
2122
/* Maximum number of characters required for the output of %ld: 21
   characters allow for a 64-bit integer in decimal plus an optional
   sign. */
#define MAX_LONG_CHARS 21

/* Maximum number of characters required for the output of %lld.
   At most ceil(log10(256) * SIZEOF_LONG_LONG) digits are needed, plus 1 for
   the sign; 53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + ((SIZEOF_LONG_LONG * 53) - 1) / 22)
2130
Walter Dörwaldd2034312007-05-18 16:29:38 +00002131PyObject *
2132PyUnicode_FromFormatV(const char *format, va_list vargs)
2133{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002134 va_list count;
2135 Py_ssize_t callcount = 0;
2136 PyObject **callresults = NULL;
2137 PyObject **callresult = NULL;
2138 Py_ssize_t n = 0;
2139 int width = 0;
2140 int precision = 0;
2141 int zeropad;
2142 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002143 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002144 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002145 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002146 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2147 Py_UCS4 argmaxchar;
2148 Py_ssize_t numbersize = 0;
2149 char *numberresults = NULL;
2150 char *numberresult = NULL;
2151 Py_ssize_t i;
2152 int kind;
2153 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002154
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002155 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002156 /* step 1: count the number of %S/%R/%A/%s format specifications
2157 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2158 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002159 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002160 * also estimate a upper bound for all the number formats in the string,
2161 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002162 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002163 for (f = format; *f; f++) {
2164 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002165 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002166 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2167 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2168 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2169 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002170
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002171 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002172#ifdef HAVE_LONG_LONG
2173 if (longlongflag) {
2174 if (width < MAX_LONG_LONG_CHARS)
2175 width = MAX_LONG_LONG_CHARS;
2176 }
2177 else
2178#endif
2179 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2180 including sign. Decimal takes the most space. This
2181 isn't enough for octal. If a width is specified we
2182 need more (which we allocate later). */
2183 if (width < MAX_LONG_CHARS)
2184 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002185
2186 /* account for the size + '\0' to separate numbers
2187 inside of the numberresults buffer */
2188 numbersize += (width + 1);
2189 }
2190 }
2191 else if ((unsigned char)*f > 127) {
2192 PyErr_Format(PyExc_ValueError,
2193 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2194 "string, got a non-ASCII byte: 0x%02x",
2195 (unsigned char)*f);
2196 return NULL;
2197 }
2198 }
2199 /* step 2: allocate memory for the results of
2200 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2201 if (callcount) {
2202 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2203 if (!callresults) {
2204 PyErr_NoMemory();
2205 return NULL;
2206 }
2207 callresult = callresults;
2208 }
2209 /* step 2.5: allocate memory for the results of formating numbers */
2210 if (numbersize) {
2211 numberresults = PyObject_Malloc(numbersize);
2212 if (!numberresults) {
2213 PyErr_NoMemory();
2214 goto fail;
2215 }
2216 numberresult = numberresults;
2217 }
2218
2219 /* step 3: format numbers and figure out how large a buffer we need */
2220 for (f = format; *f; f++) {
2221 if (*f == '%') {
2222 const char* p;
2223 int longflag;
2224 int longlongflag;
2225 int size_tflag;
2226 int numprinted;
2227
2228 p = f;
2229 zeropad = (f[1] == '0');
2230 f = parse_format_flags(f, &width, &precision,
2231 &longflag, &longlongflag, &size_tflag);
2232 switch (*f) {
2233 case 'c':
2234 {
2235 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002236 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002237 n++;
2238 break;
2239 }
2240 case '%':
2241 n++;
2242 break;
2243 case 'i':
2244 case 'd':
2245 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2246 width, precision, *f);
2247 if (longflag)
2248 numprinted = sprintf(numberresult, fmt,
2249 va_arg(count, long));
2250#ifdef HAVE_LONG_LONG
2251 else if (longlongflag)
2252 numprinted = sprintf(numberresult, fmt,
2253 va_arg(count, PY_LONG_LONG));
2254#endif
2255 else if (size_tflag)
2256 numprinted = sprintf(numberresult, fmt,
2257 va_arg(count, Py_ssize_t));
2258 else
2259 numprinted = sprintf(numberresult, fmt,
2260 va_arg(count, int));
2261 n += numprinted;
2262 /* advance by +1 to skip over the '\0' */
2263 numberresult += (numprinted + 1);
2264 assert(*(numberresult - 1) == '\0');
2265 assert(*(numberresult - 2) != '\0');
2266 assert(numprinted >= 0);
2267 assert(numberresult <= numberresults + numbersize);
2268 break;
2269 case 'u':
2270 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2271 width, precision, 'u');
2272 if (longflag)
2273 numprinted = sprintf(numberresult, fmt,
2274 va_arg(count, unsigned long));
2275#ifdef HAVE_LONG_LONG
2276 else if (longlongflag)
2277 numprinted = sprintf(numberresult, fmt,
2278 va_arg(count, unsigned PY_LONG_LONG));
2279#endif
2280 else if (size_tflag)
2281 numprinted = sprintf(numberresult, fmt,
2282 va_arg(count, size_t));
2283 else
2284 numprinted = sprintf(numberresult, fmt,
2285 va_arg(count, unsigned int));
2286 n += numprinted;
2287 numberresult += (numprinted + 1);
2288 assert(*(numberresult - 1) == '\0');
2289 assert(*(numberresult - 2) != '\0');
2290 assert(numprinted >= 0);
2291 assert(numberresult <= numberresults + numbersize);
2292 break;
2293 case 'x':
2294 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2295 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2296 n += numprinted;
2297 numberresult += (numprinted + 1);
2298 assert(*(numberresult - 1) == '\0');
2299 assert(*(numberresult - 2) != '\0');
2300 assert(numprinted >= 0);
2301 assert(numberresult <= numberresults + numbersize);
2302 break;
2303 case 'p':
2304 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2305 /* %p is ill-defined: ensure leading 0x. */
2306 if (numberresult[1] == 'X')
2307 numberresult[1] = 'x';
2308 else if (numberresult[1] != 'x') {
2309 memmove(numberresult + 2, numberresult,
2310 strlen(numberresult) + 1);
2311 numberresult[0] = '0';
2312 numberresult[1] = 'x';
2313 numprinted += 2;
2314 }
2315 n += numprinted;
2316 numberresult += (numprinted + 1);
2317 assert(*(numberresult - 1) == '\0');
2318 assert(*(numberresult - 2) != '\0');
2319 assert(numprinted >= 0);
2320 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002321 break;
2322 case 's':
2323 {
2324 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002325 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002326 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2327 if (!str)
2328 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002329 /* since PyUnicode_DecodeUTF8 returns already flexible
2330 unicode objects, there is no need to call ready on them */
2331 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002332 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002333 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002334 /* Remember the str and switch to the next slot */
2335 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002336 break;
2337 }
2338 case 'U':
2339 {
2340 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002341 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002342 if (PyUnicode_READY(obj) == -1)
2343 goto fail;
2344 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002345 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002346 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002347 break;
2348 }
2349 case 'V':
2350 {
2351 PyObject *obj = va_arg(count, PyObject *);
2352 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002353 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002354 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002355 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002356 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002357 if (PyUnicode_READY(obj) == -1)
2358 goto fail;
2359 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002360 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002361 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002362 *callresult++ = NULL;
2363 }
2364 else {
2365 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2366 if (!str_obj)
2367 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002368 if (PyUnicode_READY(str_obj)) {
2369 Py_DECREF(str_obj);
2370 goto fail;
2371 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002372 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002373 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002374 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002375 *callresult++ = str_obj;
2376 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002377 break;
2378 }
2379 case 'S':
2380 {
2381 PyObject *obj = va_arg(count, PyObject *);
2382 PyObject *str;
2383 assert(obj);
2384 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002385 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002386 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002387 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002388 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002389 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002390 /* Remember the str and switch to the next slot */
2391 *callresult++ = str;
2392 break;
2393 }
2394 case 'R':
2395 {
2396 PyObject *obj = va_arg(count, PyObject *);
2397 PyObject *repr;
2398 assert(obj);
2399 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002400 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002401 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002402 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002403 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002404 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002405 /* Remember the repr and switch to the next slot */
2406 *callresult++ = repr;
2407 break;
2408 }
2409 case 'A':
2410 {
2411 PyObject *obj = va_arg(count, PyObject *);
2412 PyObject *ascii;
2413 assert(obj);
2414 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002415 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002416 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002417 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002418 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002419 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002420 /* Remember the repr and switch to the next slot */
2421 *callresult++ = ascii;
2422 break;
2423 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002424 default:
2425 /* if we stumble upon an unknown
2426 formatting code, copy the rest of
2427 the format string to the output
2428 string. (we cannot just skip the
2429 code, since there's no way to know
2430 what's in the argument list) */
2431 n += strlen(p);
2432 goto expand;
2433 }
2434 } else
2435 n++;
2436 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002437 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002438 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002439 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002440 we don't have to resize the string.
2441 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002442 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002443 if (!string)
2444 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002445 kind = PyUnicode_KIND(string);
2446 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002447 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002448 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002449
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002450 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002451 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002452 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002453
2454 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002455 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2456 /* checking for == because the last argument could be a empty
2457 string, which causes i to point to end, the assert at the end of
2458 the loop */
2459 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002460
Benjamin Peterson14339b62009-01-31 16:36:08 +00002461 switch (*f) {
2462 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002463 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002464 const int ordinal = va_arg(vargs, int);
2465 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002466 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002467 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002468 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002469 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002470 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002471 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002472 case 'p':
2473 /* unused, since we already have the result */
2474 if (*f == 'p')
2475 (void) va_arg(vargs, void *);
2476 else
2477 (void) va_arg(vargs, int);
2478 /* extract the result from numberresults and append. */
2479 for (; *numberresult; ++i, ++numberresult)
2480 PyUnicode_WRITE(kind, data, i, *numberresult);
2481 /* skip over the separating '\0' */
2482 assert(*numberresult == '\0');
2483 numberresult++;
2484 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002485 break;
2486 case 's':
2487 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002488 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002489 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002490 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002491 size = PyUnicode_GET_LENGTH(*callresult);
2492 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002493 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002494 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002495 /* We're done with the unicode()/repr() => forget it */
2496 Py_DECREF(*callresult);
2497 /* switch to next unicode()/repr() result */
2498 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002499 break;
2500 }
2501 case 'U':
2502 {
2503 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002504 Py_ssize_t size;
2505 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2506 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002507 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002508 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002509 break;
2510 }
2511 case 'V':
2512 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002513 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002514 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002515 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002516 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002517 size = PyUnicode_GET_LENGTH(obj);
2518 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002519 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002520 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002521 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002522 size = PyUnicode_GET_LENGTH(*callresult);
2523 assert(PyUnicode_KIND(*callresult) <=
2524 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002525 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002526 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002527 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002528 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002529 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002530 break;
2531 }
2532 case 'S':
2533 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002534 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002535 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002536 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002537 /* unused, since we already have the result */
2538 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002539 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002540 copy_characters(string, i, *callresult, 0, size);
2541 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002542 /* We're done with the unicode()/repr() => forget it */
2543 Py_DECREF(*callresult);
2544 /* switch to next unicode()/repr() result */
2545 ++callresult;
2546 break;
2547 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002548 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002549 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002550 break;
2551 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002552 for (; *p; ++p, ++i)
2553 PyUnicode_WRITE(kind, data, i, *p);
2554 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002555 goto end;
2556 }
Victor Stinner1205f272010-09-11 00:54:47 +00002557 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002558 else {
2559 assert(i < PyUnicode_GET_LENGTH(string));
2560 PyUnicode_WRITE(kind, data, i++, *f);
2561 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002562 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002563 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002564
Benjamin Peterson29060642009-01-31 22:14:21 +00002565 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002566 if (callresults)
2567 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002568 if (numberresults)
2569 PyObject_Free(numberresults);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002570 assert(_PyUnicode_CheckConsistency(string, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002571 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002572 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002573 if (callresults) {
2574 PyObject **callresult2 = callresults;
2575 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002576 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002577 ++callresult2;
2578 }
2579 PyObject_Free(callresults);
2580 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002581 if (numberresults)
2582 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002583 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002584}
2585
Walter Dörwaldd2034312007-05-18 16:29:38 +00002586PyObject *
2587PyUnicode_FromFormat(const char *format, ...)
2588{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002589 PyObject* ret;
2590 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002591
2592#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002593 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002594#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002595 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002596#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002597 ret = PyUnicode_FromFormatV(format, vargs);
2598 va_end(vargs);
2599 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002600}
2601
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002602#ifdef HAVE_WCHAR_H
2603
Victor Stinner5593d8a2010-10-02 11:11:27 +00002604/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2605 convert a Unicode object to a wide character string.
2606
Victor Stinnerd88d9832011-09-06 02:00:05 +02002607 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002608 character) required to convert the unicode object. Ignore size argument.
2609
Victor Stinnerd88d9832011-09-06 02:00:05 +02002610 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002611 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002612 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002613static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002614unicode_aswidechar(PyUnicodeObject *unicode,
2615 wchar_t *w,
2616 Py_ssize_t size)
2617{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002618 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002619 const wchar_t *wstr;
2620
2621 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2622 if (wstr == NULL)
2623 return -1;
2624
Victor Stinner5593d8a2010-10-02 11:11:27 +00002625 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002626 if (size > res)
2627 size = res + 1;
2628 else
2629 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002630 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002631 return res;
2632 }
2633 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002634 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002635}
2636
2637Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002638PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002639 wchar_t *w,
2640 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002641{
2642 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002643 PyErr_BadInternalCall();
2644 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002645 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002646 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002647}
2648
Victor Stinner137c34c2010-09-29 10:25:54 +00002649wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002650PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002651 Py_ssize_t *size)
2652{
2653 wchar_t* buffer;
2654 Py_ssize_t buflen;
2655
2656 if (unicode == NULL) {
2657 PyErr_BadInternalCall();
2658 return NULL;
2659 }
2660
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002661 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002662 if (buflen == -1)
2663 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002664 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002665 PyErr_NoMemory();
2666 return NULL;
2667 }
2668
Victor Stinner137c34c2010-09-29 10:25:54 +00002669 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2670 if (buffer == NULL) {
2671 PyErr_NoMemory();
2672 return NULL;
2673 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002674 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002675 if (buflen == -1)
2676 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002677 if (size != NULL)
2678 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002679 return buffer;
2680}
2681
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002682#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002683
Alexander Belopolsky40018472011-02-26 01:02:56 +00002684PyObject *
2685PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002686{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002687 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002688 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002689 PyErr_SetString(PyExc_ValueError,
2690 "chr() arg not in range(0x110000)");
2691 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002692 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002693
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002694 if (ordinal < 256)
2695 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002696
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002697 v = PyUnicode_New(1, ordinal);
2698 if (v == NULL)
2699 return NULL;
2700 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002701 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002702 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002703}
2704
Alexander Belopolsky40018472011-02-26 01:02:56 +00002705PyObject *
2706PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002707{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002708 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002709 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002710 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002711 if (PyUnicode_READY(obj))
2712 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002713 Py_INCREF(obj);
2714 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002715 }
2716 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002717 /* For a Unicode subtype that's not a Unicode object,
2718 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002719 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002720 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002721 PyErr_Format(PyExc_TypeError,
2722 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002723 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002724 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002725}
2726
Alexander Belopolsky40018472011-02-26 01:02:56 +00002727PyObject *
2728PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002729 const char *encoding,
2730 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002731{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002732 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002733 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002734
Guido van Rossumd57fd912000-03-10 22:53:23 +00002735 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002736 PyErr_BadInternalCall();
2737 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002738 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002739
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002740 /* Decoding bytes objects is the most common case and should be fast */
2741 if (PyBytes_Check(obj)) {
2742 if (PyBytes_GET_SIZE(obj) == 0) {
2743 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002744 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002745 }
2746 else {
2747 v = PyUnicode_Decode(
2748 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2749 encoding, errors);
2750 }
2751 return v;
2752 }
2753
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002754 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002755 PyErr_SetString(PyExc_TypeError,
2756 "decoding str is not supported");
2757 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002758 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002759
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002760 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2761 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2762 PyErr_Format(PyExc_TypeError,
2763 "coercing to str: need bytes, bytearray "
2764 "or buffer-like object, %.80s found",
2765 Py_TYPE(obj)->tp_name);
2766 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002767 }
Tim Petersced69f82003-09-16 20:30:58 +00002768
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002769 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002770 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002771 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002772 }
Tim Petersced69f82003-09-16 20:30:58 +00002773 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002774 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002775
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002776 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002777 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002778}
2779
/* Copy `encoding` into `lower`, lower-casing upper-case characters and
   turning '_' into '-', so that spellings such as UTF_8 are recognized.

   `lower` has room for lower_len chars, terminator included.  Return 1 on
   success (`lower` is then null-terminated).  Return 0 if the encoding
   name is longer than lower_len-1 characters; in that case `lower` holds a
   partial copy that has not been null-terminated. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* Last usable slot: reserved for the terminating '\0'. */
    char *const limit = lower + (lower_len - 1);

    for (src = encoding; *src; src++) {
        char c = *src;

        if (dst == limit)
            return 0;

        if (Py_ISUPPER(c))
            c = Py_TOLOWER(c);
        else if (c == '_')
            c = '-';
        *dst++ = c;
    }
    *dst = '\0';
    return 1;
}
2812
Alexander Belopolsky40018472011-02-26 01:02:56 +00002813PyObject *
2814PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002815 Py_ssize_t size,
2816 const char *encoding,
2817 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002818{
2819 PyObject *buffer = NULL, *unicode;
2820 Py_buffer info;
2821 char lower[11]; /* Enough for any encoding shortcut */
2822
2823 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002824 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002825
2826 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002827 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002828 if ((strcmp(lower, "utf-8") == 0) ||
2829 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002830 return PyUnicode_DecodeUTF8(s, size, errors);
2831 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002832 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002833 (strcmp(lower, "iso-8859-1") == 0))
2834 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002835#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002836 else if (strcmp(lower, "mbcs") == 0)
2837 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002838#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002839 else if (strcmp(lower, "ascii") == 0)
2840 return PyUnicode_DecodeASCII(s, size, errors);
2841 else if (strcmp(lower, "utf-16") == 0)
2842 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2843 else if (strcmp(lower, "utf-32") == 0)
2844 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2845 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002846
2847 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002848 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002849 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002850 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002851 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002852 if (buffer == NULL)
2853 goto onError;
2854 unicode = PyCodec_Decode(buffer, encoding, errors);
2855 if (unicode == NULL)
2856 goto onError;
2857 if (!PyUnicode_Check(unicode)) {
2858 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002859 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002860 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002861 Py_DECREF(unicode);
2862 goto onError;
2863 }
2864 Py_DECREF(buffer);
Victor Stinner17efeed2011-10-04 20:05:46 +02002865#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02002866 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002867 Py_DECREF(unicode);
2868 return NULL;
2869 }
Victor Stinner17efeed2011-10-04 20:05:46 +02002870#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002871 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002872 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002873
Benjamin Peterson29060642009-01-31 22:14:21 +00002874 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002875 Py_XDECREF(buffer);
2876 return NULL;
2877}
2878
Alexander Belopolsky40018472011-02-26 01:02:56 +00002879PyObject *
2880PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002881 const char *encoding,
2882 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002883{
2884 PyObject *v;
2885
2886 if (!PyUnicode_Check(unicode)) {
2887 PyErr_BadArgument();
2888 goto onError;
2889 }
2890
2891 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002892 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002893
2894 /* Decode via the codec registry */
2895 v = PyCodec_Decode(unicode, encoding, errors);
2896 if (v == NULL)
2897 goto onError;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002898 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002899 return v;
2900
Benjamin Peterson29060642009-01-31 22:14:21 +00002901 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002902 return NULL;
2903}
2904
Alexander Belopolsky40018472011-02-26 01:02:56 +00002905PyObject *
2906PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002907 const char *encoding,
2908 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002909{
2910 PyObject *v;
2911
2912 if (!PyUnicode_Check(unicode)) {
2913 PyErr_BadArgument();
2914 goto onError;
2915 }
2916
2917 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002918 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002919
2920 /* Decode via the codec registry */
2921 v = PyCodec_Decode(unicode, encoding, errors);
2922 if (v == NULL)
2923 goto onError;
2924 if (!PyUnicode_Check(v)) {
2925 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002926 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002927 Py_TYPE(v)->tp_name);
2928 Py_DECREF(v);
2929 goto onError;
2930 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002931 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002932 return v;
2933
Benjamin Peterson29060642009-01-31 22:14:21 +00002934 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002935 return NULL;
2936}
2937
Alexander Belopolsky40018472011-02-26 01:02:56 +00002938PyObject *
2939PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002940 Py_ssize_t size,
2941 const char *encoding,
2942 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002943{
2944 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002945
Guido van Rossumd57fd912000-03-10 22:53:23 +00002946 unicode = PyUnicode_FromUnicode(s, size);
2947 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002948 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002949 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2950 Py_DECREF(unicode);
2951 return v;
2952}
2953
Alexander Belopolsky40018472011-02-26 01:02:56 +00002954PyObject *
2955PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002956 const char *encoding,
2957 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002958{
2959 PyObject *v;
2960
2961 if (!PyUnicode_Check(unicode)) {
2962 PyErr_BadArgument();
2963 goto onError;
2964 }
2965
2966 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002967 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002968
2969 /* Encode via the codec registry */
2970 v = PyCodec_Encode(unicode, encoding, errors);
2971 if (v == NULL)
2972 goto onError;
2973 return v;
2974
Benjamin Peterson29060642009-01-31 22:14:21 +00002975 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002976 return NULL;
2977}
2978
Victor Stinnerad158722010-10-27 00:25:46 +00002979PyObject *
2980PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002981{
Victor Stinner99b95382011-07-04 14:23:54 +02002982#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002983 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2984 PyUnicode_GET_SIZE(unicode),
2985 NULL);
2986#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002987 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002988#else
Victor Stinner793b5312011-04-27 00:24:21 +02002989 PyInterpreterState *interp = PyThreadState_GET()->interp;
2990 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2991 cannot use it to encode and decode filenames before it is loaded. Load
2992 the Python codec requires to encode at least its own filename. Use the C
2993 version of the locale codec until the codec registry is initialized and
2994 the Python codec is loaded.
2995
2996 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2997 cannot only rely on it: check also interp->fscodec_initialized for
2998 subinterpreters. */
2999 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003000 return PyUnicode_AsEncodedString(unicode,
3001 Py_FileSystemDefaultEncoding,
3002 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003003 }
3004 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003005 /* locale encoding with surrogateescape */
3006 wchar_t *wchar;
3007 char *bytes;
3008 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00003009 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003010
3011 wchar = PyUnicode_AsWideCharString(unicode, NULL);
3012 if (wchar == NULL)
3013 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003014 bytes = _Py_wchar2char(wchar, &error_pos);
3015 if (bytes == NULL) {
3016 if (error_pos != (size_t)-1) {
3017 char *errmsg = strerror(errno);
3018 PyObject *exc = NULL;
3019 if (errmsg == NULL)
3020 errmsg = "Py_wchar2char() failed";
3021 raise_encode_exception(&exc,
3022 "filesystemencoding",
3023 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
3024 error_pos, error_pos+1,
3025 errmsg);
3026 Py_XDECREF(exc);
3027 }
3028 else
3029 PyErr_NoMemory();
3030 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003031 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003032 }
3033 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003034
3035 bytes_obj = PyBytes_FromString(bytes);
3036 PyMem_Free(bytes);
3037 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00003038 }
Victor Stinnerad158722010-10-27 00:25:46 +00003039#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003040}
3041
Alexander Belopolsky40018472011-02-26 01:02:56 +00003042PyObject *
3043PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003044 const char *encoding,
3045 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003046{
3047 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003048 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003049
Guido van Rossumd57fd912000-03-10 22:53:23 +00003050 if (!PyUnicode_Check(unicode)) {
3051 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003052 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003053 }
Fred Drakee4315f52000-05-09 19:53:39 +00003054
Victor Stinner2f283c22011-03-02 01:21:46 +00003055 if (encoding == NULL) {
3056 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003057 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003058 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003059 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00003060 }
Fred Drakee4315f52000-05-09 19:53:39 +00003061
3062 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003063 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003064 if ((strcmp(lower, "utf-8") == 0) ||
3065 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003066 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003067 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003068 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003069 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003070 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003071 }
Victor Stinner37296e82010-06-10 13:36:23 +00003072 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003073 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003074 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003075 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003076#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003077 else if (strcmp(lower, "mbcs") == 0)
3078 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3079 PyUnicode_GET_SIZE(unicode),
3080 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003081#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003082 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003083 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003084 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003085
3086 /* Encode via the codec registry */
3087 v = PyCodec_Encode(unicode, encoding, errors);
3088 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003089 return NULL;
3090
3091 /* The normal path */
3092 if (PyBytes_Check(v))
3093 return v;
3094
3095 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003096 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003097 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003098 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003099
3100 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3101 "encoder %s returned bytearray instead of bytes",
3102 encoding);
3103 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003104 Py_DECREF(v);
3105 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003106 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003107
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003108 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3109 Py_DECREF(v);
3110 return b;
3111 }
3112
3113 PyErr_Format(PyExc_TypeError,
3114 "encoder did not return a bytes object (type=%.400s)",
3115 Py_TYPE(v)->tp_name);
3116 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003117 return NULL;
3118}
3119
Alexander Belopolsky40018472011-02-26 01:02:56 +00003120PyObject *
3121PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003122 const char *encoding,
3123 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003124{
3125 PyObject *v;
3126
3127 if (!PyUnicode_Check(unicode)) {
3128 PyErr_BadArgument();
3129 goto onError;
3130 }
3131
3132 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003133 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003134
3135 /* Encode via the codec registry */
3136 v = PyCodec_Encode(unicode, encoding, errors);
3137 if (v == NULL)
3138 goto onError;
3139 if (!PyUnicode_Check(v)) {
3140 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003141 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003142 Py_TYPE(v)->tp_name);
3143 Py_DECREF(v);
3144 goto onError;
3145 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003146 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003147
Benjamin Peterson29060642009-01-31 22:14:21 +00003148 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003149 return NULL;
3150}
3151
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003152PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003153PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003154 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003155 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3156}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003157
Christian Heimes5894ba72007-11-04 11:43:14 +00003158PyObject*
3159PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3160{
Victor Stinner99b95382011-07-04 14:23:54 +02003161#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003162 return PyUnicode_DecodeMBCS(s, size, NULL);
3163#elif defined(__APPLE__)
3164 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
3165#else
Victor Stinner793b5312011-04-27 00:24:21 +02003166 PyInterpreterState *interp = PyThreadState_GET()->interp;
3167 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3168 cannot use it to encode and decode filenames before it is loaded. Load
3169 the Python codec requires to encode at least its own filename. Use the C
3170 version of the locale codec until the codec registry is initialized and
3171 the Python codec is loaded.
3172
3173 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3174 cannot only rely on it: check also interp->fscodec_initialized for
3175 subinterpreters. */
3176 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003177 return PyUnicode_Decode(s, size,
3178 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003179 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003180 }
3181 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003182 /* locale encoding with surrogateescape */
3183 wchar_t *wchar;
3184 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00003185 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003186
3187 if (s[size] != '\0' || size != strlen(s)) {
3188 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3189 return NULL;
3190 }
3191
Victor Stinner168e1172010-10-16 23:16:16 +00003192 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003193 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003194 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003195
Victor Stinner168e1172010-10-16 23:16:16 +00003196 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003197 PyMem_Free(wchar);
3198 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003199 }
Victor Stinnerad158722010-10-27 00:25:46 +00003200#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003201}
3202
Martin v. Löwis011e8422009-05-05 04:43:17 +00003203
3204int
3205PyUnicode_FSConverter(PyObject* arg, void* addr)
3206{
3207 PyObject *output = NULL;
3208 Py_ssize_t size;
3209 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003210 if (arg == NULL) {
3211 Py_DECREF(*(PyObject**)addr);
3212 return 1;
3213 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003214 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003215 output = arg;
3216 Py_INCREF(output);
3217 }
3218 else {
3219 arg = PyUnicode_FromObject(arg);
3220 if (!arg)
3221 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003222 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003223 Py_DECREF(arg);
3224 if (!output)
3225 return 0;
3226 if (!PyBytes_Check(output)) {
3227 Py_DECREF(output);
3228 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3229 return 0;
3230 }
3231 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003232 size = PyBytes_GET_SIZE(output);
3233 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003234 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003235 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003236 Py_DECREF(output);
3237 return 0;
3238 }
3239 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003240 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003241}
3242
3243
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003244int
3245PyUnicode_FSDecoder(PyObject* arg, void* addr)
3246{
3247 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003248 if (arg == NULL) {
3249 Py_DECREF(*(PyObject**)addr);
3250 return 1;
3251 }
3252 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003253 if (PyUnicode_READY(arg))
3254 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003255 output = arg;
3256 Py_INCREF(output);
3257 }
3258 else {
3259 arg = PyBytes_FromObject(arg);
3260 if (!arg)
3261 return 0;
3262 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3263 PyBytes_GET_SIZE(arg));
3264 Py_DECREF(arg);
3265 if (!output)
3266 return 0;
3267 if (!PyUnicode_Check(output)) {
3268 Py_DECREF(output);
3269 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3270 return 0;
3271 }
3272 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003273 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3274 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003275 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3276 Py_DECREF(output);
3277 return 0;
3278 }
3279 *(PyObject**)addr = output;
3280 return Py_CLEANUP_SUPPORTED;
3281}
3282
3283
Martin v. Löwis5b222132007-06-10 09:51:05 +00003284char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003285PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003286{
Christian Heimesf3863112007-11-22 07:46:41 +00003287 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003288 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
3289
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003290 if (!PyUnicode_Check(unicode)) {
3291 PyErr_BadArgument();
3292 return NULL;
3293 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003294 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003295 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003296
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003297 if (PyUnicode_UTF8(unicode) == NULL) {
3298 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003299 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3300 if (bytes == NULL)
3301 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003302 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3303 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003304 Py_DECREF(bytes);
3305 return NULL;
3306 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003307 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
3308 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003309 Py_DECREF(bytes);
3310 }
3311
3312 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003313 *psize = PyUnicode_UTF8_LENGTH(unicode);
3314 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003315}
3316
3317char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003318PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003319{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003320 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3321}
3322
3323#ifdef Py_DEBUG
3324int unicode_as_unicode_calls = 0;
3325#endif
3326
3327
3328Py_UNICODE *
3329PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3330{
3331 PyUnicodeObject *u;
3332 const unsigned char *one_byte;
3333#if SIZEOF_WCHAR_T == 4
3334 const Py_UCS2 *two_bytes;
3335#else
3336 const Py_UCS4 *four_bytes;
3337 const Py_UCS4 *ucs4_end;
3338 Py_ssize_t num_surrogates;
3339#endif
3340 wchar_t *w;
3341 wchar_t *wchar_end;
3342
3343 if (!PyUnicode_Check(unicode)) {
3344 PyErr_BadArgument();
3345 return NULL;
3346 }
3347 u = (PyUnicodeObject*)unicode;
3348 if (_PyUnicode_WSTR(u) == NULL) {
3349 /* Non-ASCII compact unicode object */
3350 assert(_PyUnicode_KIND(u) != 0);
3351 assert(PyUnicode_IS_READY(u));
3352
3353#ifdef Py_DEBUG
3354 ++unicode_as_unicode_calls;
3355#endif
3356
3357 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
3358#if SIZEOF_WCHAR_T == 2
3359 four_bytes = PyUnicode_4BYTE_DATA(u);
3360 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
3361 num_surrogates = 0;
3362
3363 for (; four_bytes < ucs4_end; ++four_bytes) {
3364 if (*four_bytes > 0xFFFF)
3365 ++num_surrogates;
3366 }
3367
3368 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3369 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3370 if (!_PyUnicode_WSTR(u)) {
3371 PyErr_NoMemory();
3372 return NULL;
3373 }
3374 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3375
3376 w = _PyUnicode_WSTR(u);
3377 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3378 four_bytes = PyUnicode_4BYTE_DATA(u);
3379 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3380 if (*four_bytes > 0xFFFF) {
3381 /* encode surrogate pair in this case */
3382 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3383 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3384 }
3385 else
3386 *w = *four_bytes;
3387
3388 if (w > wchar_end) {
3389 assert(0 && "Miscalculated string end");
3390 }
3391 }
3392 *w = 0;
3393#else
3394 /* sizeof(wchar_t) == 4 */
3395 Py_FatalError("Impossible unicode object state, wstr and str "
3396 "should share memory already.");
3397 return NULL;
3398#endif
3399 }
3400 else {
3401 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3402 (_PyUnicode_LENGTH(u) + 1));
3403 if (!_PyUnicode_WSTR(u)) {
3404 PyErr_NoMemory();
3405 return NULL;
3406 }
3407 if (!PyUnicode_IS_COMPACT_ASCII(u))
3408 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3409 w = _PyUnicode_WSTR(u);
3410 wchar_end = w + _PyUnicode_LENGTH(u);
3411
3412 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3413 one_byte = PyUnicode_1BYTE_DATA(u);
3414 for (; w < wchar_end; ++one_byte, ++w)
3415 *w = *one_byte;
3416 /* null-terminate the wstr */
3417 *w = 0;
3418 }
3419 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3420#if SIZEOF_WCHAR_T == 4
3421 two_bytes = PyUnicode_2BYTE_DATA(u);
3422 for (; w < wchar_end; ++two_bytes, ++w)
3423 *w = *two_bytes;
3424 /* null-terminate the wstr */
3425 *w = 0;
3426#else
3427 /* sizeof(wchar_t) == 2 */
3428 PyObject_FREE(_PyUnicode_WSTR(u));
3429 _PyUnicode_WSTR(u) = NULL;
3430 Py_FatalError("Impossible unicode object state, wstr "
3431 "and str should share memory already.");
3432 return NULL;
3433#endif
3434 }
3435 else {
3436 assert(0 && "This should never happen.");
3437 }
3438 }
3439 }
3440 if (size != NULL)
3441 *size = PyUnicode_WSTR_LENGTH(u);
3442 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003443}
3444
Alexander Belopolsky40018472011-02-26 01:02:56 +00003445Py_UNICODE *
3446PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003447{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003448 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003449}
3450
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003451
Alexander Belopolsky40018472011-02-26 01:02:56 +00003452Py_ssize_t
3453PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003454{
3455 if (!PyUnicode_Check(unicode)) {
3456 PyErr_BadArgument();
3457 goto onError;
3458 }
3459 return PyUnicode_GET_SIZE(unicode);
3460
Benjamin Peterson29060642009-01-31 22:14:21 +00003461 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003462 return -1;
3463}
3464
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003465Py_ssize_t
3466PyUnicode_GetLength(PyObject *unicode)
3467{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003468 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003469 PyErr_BadArgument();
3470 return -1;
3471 }
3472
3473 return PyUnicode_GET_LENGTH(unicode);
3474}
3475
3476Py_UCS4
3477PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3478{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003479 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3480 PyErr_BadArgument();
3481 return (Py_UCS4)-1;
3482 }
3483 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3484 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003485 return (Py_UCS4)-1;
3486 }
3487 return PyUnicode_READ_CHAR(unicode, index);
3488}
3489
3490int
3491PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3492{
3493 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003494 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003495 return -1;
3496 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003497 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3498 PyErr_SetString(PyExc_IndexError, "string index out of range");
3499 return -1;
3500 }
3501 if (_PyUnicode_Dirty(unicode))
3502 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003503 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3504 index, ch);
3505 return 0;
3506}
3507
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3513
Victor Stinner554f3f02010-06-16 23:33:54 +00003514/* create or adjust a UnicodeDecodeError */
3515static void
3516make_decode_exception(PyObject **exceptionObject,
3517 const char *encoding,
3518 const char *input, Py_ssize_t length,
3519 Py_ssize_t startpos, Py_ssize_t endpos,
3520 const char *reason)
3521{
3522 if (*exceptionObject == NULL) {
3523 *exceptionObject = PyUnicodeDecodeError_Create(
3524 encoding, input, length, startpos, endpos, reason);
3525 }
3526 else {
3527 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3528 goto onError;
3529 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3530 goto onError;
3531 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3532 goto onError;
3533 }
3534 return;
3535
3536onError:
3537 Py_DECREF(*exceptionObject);
3538 *exceptionObject = NULL;
3539}
3540
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003541/* error handling callback helper:
3542 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003543 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003544 and adjust various state variables.
3545 return 0 on success, -1 on error
3546*/
3547
Alexander Belopolsky40018472011-02-26 01:02:56 +00003548static int
3549unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003550 const char *encoding, const char *reason,
3551 const char **input, const char **inend, Py_ssize_t *startinpos,
3552 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3553 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003554{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003555 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003556
3557 PyObject *restuple = NULL;
3558 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003559 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003560 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003561 Py_ssize_t requiredsize;
3562 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003563 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003564 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003565 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003566 int res = -1;
3567
3568 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003569 *errorHandler = PyCodec_LookupError(errors);
3570 if (*errorHandler == NULL)
3571 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003572 }
3573
Victor Stinner554f3f02010-06-16 23:33:54 +00003574 make_decode_exception(exceptionObject,
3575 encoding,
3576 *input, *inend - *input,
3577 *startinpos, *endinpos,
3578 reason);
3579 if (*exceptionObject == NULL)
3580 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003581
3582 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3583 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003584 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003585 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003586 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003587 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003588 }
3589 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003590 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003591
3592 /* Copy back the bytes variables, which might have been modified by the
3593 callback */
3594 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3595 if (!inputobj)
3596 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003597 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003598 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003599 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003600 *input = PyBytes_AS_STRING(inputobj);
3601 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003602 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003603 /* we can DECREF safely, as the exception has another reference,
3604 so the object won't go away. */
3605 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003606
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003607 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003608 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003609 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003610 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3611 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003612 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003613
3614 /* need more space? (at least enough for what we
3615 have+the replacement+the rest of the string (starting
3616 at the new input position), so we won't have to check space
3617 when there are no errors in the rest of the string) */
3618 repptr = PyUnicode_AS_UNICODE(repunicode);
3619 repsize = PyUnicode_GET_SIZE(repunicode);
3620 requiredsize = *outpos + repsize + insize-newpos;
3621 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003622 if (requiredsize<2*outsize)
3623 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003624 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003625 goto onError;
3626 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003627 }
3628 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003629 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003630 Py_UNICODE_COPY(*outptr, repptr, repsize);
3631 *outptr += repsize;
3632 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003633
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003634 /* we made it! */
3635 res = 0;
3636
Benjamin Peterson29060642009-01-31 22:14:21 +00003637 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003638 Py_XDECREF(restuple);
3639 return res;
3640}
3641
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003642/* --- UTF-7 Codec -------------------------------------------------------- */
3643
Antoine Pitrou244651a2009-05-04 18:56:13 +00003644/* See RFC2152 for details. We encode conservatively and decode liberally. */
3645
3646/* Three simple macros defining base-64. */
3647
3648/* Is c a base-64 character? */
3649
3650#define IS_BASE64(c) \
3651 (((c) >= 'A' && (c) <= 'Z') || \
3652 ((c) >= 'a' && (c) <= 'z') || \
3653 ((c) >= '0' && (c) <= '9') || \
3654 (c) == '+' || (c) == '/')
3655
/* Map a base-64 character c to its 6-bit value (A-Z -> 0..25,
   a-z -> 26..51, 0-9 -> 52..61, '+' -> 62, anything else -> 63).
   Callers are expected to have checked IS_BASE64(c) first, in which
   case "anything else" can only be '/'. */

#define FROM_BASE64(c) \
    (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
     ((c) != '+') ? 63 : 62)
3663
/* Base-64 character that encodes the low 6 bits of n; any higher
   bits of n are ignored. */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ" \
     "abcdefghijklmnopqrstuvwxyz" \
     "0123456789+/"[(n) & 0x3f])
3668
/* DECODE_DIRECT: nonzero when a byte met outside a shift sequence in
 * UTF-7 input stands for itself.  Decoding is permissive: every value
 * up to 127 qualifies except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) < 128)
3676
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is only ever read,
 * hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
3710
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c         the character; must be in 1..127 to ever qualify
 *   directO   nonzero if Set O characters may be written literally
 *   directWS  nonzero if whitespace may be written literally
 *
 * Set D characters (category 0) always qualify.  All three arguments are
 * parenthesized in the expansion so that compound expressions such as
 * "a || b" can be passed safely; c is evaluated several times. */

#define ENCODE_DIRECT(c, directO, directWS) \
    ((c) < 128 && (c) > 0 && \
     ((utf7_category[(c)] == 0) || \
      ((directWS) && (utf7_category[(c)] == 2)) || \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003722
Alexander Belopolsky40018472011-02-26 01:02:56 +00003723PyObject *
3724PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003725 Py_ssize_t size,
3726 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003727{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003728 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3729}
3730
Antoine Pitrou244651a2009-05-04 18:56:13 +00003731/* The decoder. The only state we preserve is our read position,
3732 * i.e. how many characters we have consumed. So if we end in the
3733 * middle of a shift sequence we have to back off the read position
3734 * and the output to the beginning of the sequence, otherwise we lose
3735 * all the shift state (seen bits, number of bits seen, high
3736 * surrogate). */
3737
Alexander Belopolsky40018472011-02-26 01:02:56 +00003738PyObject *
3739PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003740 Py_ssize_t size,
3741 const char *errors,
3742 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003743{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003744 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003745 Py_ssize_t startinpos;
3746 Py_ssize_t endinpos;
3747 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003748 const char *e;
3749 PyUnicodeObject *unicode;
3750 Py_UNICODE *p;
3751 const char *errmsg = "";
3752 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003753 Py_UNICODE *shiftOutStart;
3754 unsigned int base64bits = 0;
3755 unsigned long base64buffer = 0;
3756 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003757 PyObject *errorHandler = NULL;
3758 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003759
3760 unicode = _PyUnicode_New(size);
3761 if (!unicode)
3762 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003763 if (size == 0) {
3764 if (consumed)
3765 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003766 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003767 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003768
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003769 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003770 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003771 e = s + size;
3772
3773 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003774 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003775 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003776 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003777
Antoine Pitrou244651a2009-05-04 18:56:13 +00003778 if (inShift) { /* in a base-64 section */
3779 if (IS_BASE64(ch)) { /* consume a base-64 character */
3780 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3781 base64bits += 6;
3782 s++;
3783 if (base64bits >= 16) {
3784 /* we have enough bits for a UTF-16 value */
3785 Py_UNICODE outCh = (Py_UNICODE)
3786 (base64buffer >> (base64bits-16));
3787 base64bits -= 16;
3788 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3789 if (surrogate) {
3790 /* expecting a second surrogate */
3791 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3792#ifdef Py_UNICODE_WIDE
3793 *p++ = (((surrogate & 0x3FF)<<10)
3794 | (outCh & 0x3FF)) + 0x10000;
3795#else
3796 *p++ = surrogate;
3797 *p++ = outCh;
3798#endif
3799 surrogate = 0;
3800 }
3801 else {
3802 surrogate = 0;
3803 errmsg = "second surrogate missing";
3804 goto utf7Error;
3805 }
3806 }
3807 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3808 /* first surrogate */
3809 surrogate = outCh;
3810 }
3811 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3812 errmsg = "unexpected second surrogate";
3813 goto utf7Error;
3814 }
3815 else {
3816 *p++ = outCh;
3817 }
3818 }
3819 }
3820 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003821 inShift = 0;
3822 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003823 if (surrogate) {
3824 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003825 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003826 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003827 if (base64bits > 0) { /* left-over bits */
3828 if (base64bits >= 6) {
3829 /* We've seen at least one base-64 character */
3830 errmsg = "partial character in shift sequence";
3831 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003832 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003833 else {
3834 /* Some bits remain; they should be zero */
3835 if (base64buffer != 0) {
3836 errmsg = "non-zero padding bits in shift sequence";
3837 goto utf7Error;
3838 }
3839 }
3840 }
3841 if (ch != '-') {
3842 /* '-' is absorbed; other terminating
3843 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003844 *p++ = ch;
3845 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003846 }
3847 }
3848 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003849 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003850 s++; /* consume '+' */
3851 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003852 s++;
3853 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003854 }
3855 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003856 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003857 shiftOutStart = p;
3858 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003859 }
3860 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003861 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003862 *p++ = ch;
3863 s++;
3864 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003865 else {
3866 startinpos = s-starts;
3867 s++;
3868 errmsg = "unexpected special character";
3869 goto utf7Error;
3870 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003871 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003872utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003873 outpos = p-PyUnicode_AS_UNICODE(unicode);
3874 endinpos = s-starts;
3875 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003876 errors, &errorHandler,
3877 "utf7", errmsg,
3878 &starts, &e, &startinpos, &endinpos, &exc, &s,
3879 &unicode, &outpos, &p))
3880 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003881 }
3882
Antoine Pitrou244651a2009-05-04 18:56:13 +00003883 /* end of string */
3884
3885 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3886 /* if we're in an inconsistent state, that's an error */
3887 if (surrogate ||
3888 (base64bits >= 6) ||
3889 (base64bits > 0 && base64buffer != 0)) {
3890 outpos = p-PyUnicode_AS_UNICODE(unicode);
3891 endinpos = size;
3892 if (unicode_decode_call_errorhandler(
3893 errors, &errorHandler,
3894 "utf7", "unterminated shift sequence",
3895 &starts, &e, &startinpos, &endinpos, &exc, &s,
3896 &unicode, &outpos, &p))
3897 goto onError;
3898 if (s < e)
3899 goto restart;
3900 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003901 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003902
3903 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003904 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003905 if (inShift) {
3906 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003907 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003908 }
3909 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003910 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003911 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003912 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003913
Victor Stinnerfe226c02011-10-03 03:52:20 +02003914 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003915 goto onError;
3916
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003917 Py_XDECREF(errorHandler);
3918 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02003919#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02003920 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003921 Py_DECREF(unicode);
3922 return NULL;
3923 }
Victor Stinner17efeed2011-10-04 20:05:46 +02003924#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02003925 assert(_PyUnicode_CheckConsistency(unicode, 1));
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003926 return (PyObject *)unicode;
3927
Benjamin Peterson29060642009-01-31 22:14:21 +00003928 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003929 Py_XDECREF(errorHandler);
3930 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003931 Py_DECREF(unicode);
3932 return NULL;
3933}
3934
3935
Alexander Belopolsky40018472011-02-26 01:02:56 +00003936PyObject *
3937PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003938 Py_ssize_t size,
3939 int base64SetO,
3940 int base64WhiteSpace,
3941 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003942{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003943 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003944 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003945 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003946 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003947 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003948 unsigned int base64bits = 0;
3949 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003950 char * out;
3951 char * start;
3952
3953 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003954 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003955
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003956 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003957 return PyErr_NoMemory();
3958
Antoine Pitrou244651a2009-05-04 18:56:13 +00003959 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003960 if (v == NULL)
3961 return NULL;
3962
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003963 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003964 for (;i < size; ++i) {
3965 Py_UNICODE ch = s[i];
3966
Antoine Pitrou244651a2009-05-04 18:56:13 +00003967 if (inShift) {
3968 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3969 /* shifting out */
3970 if (base64bits) { /* output remaining bits */
3971 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3972 base64buffer = 0;
3973 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003974 }
3975 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003976 /* Characters not in the BASE64 set implicitly unshift the sequence
3977 so no '-' is required, except if the character is itself a '-' */
3978 if (IS_BASE64(ch) || ch == '-') {
3979 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003980 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003981 *out++ = (char) ch;
3982 }
3983 else {
3984 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003985 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003986 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003987 else { /* not in a shift sequence */
3988 if (ch == '+') {
3989 *out++ = '+';
3990 *out++ = '-';
3991 }
3992 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3993 *out++ = (char) ch;
3994 }
3995 else {
3996 *out++ = '+';
3997 inShift = 1;
3998 goto encode_char;
3999 }
4000 }
4001 continue;
4002encode_char:
4003#ifdef Py_UNICODE_WIDE
4004 if (ch >= 0x10000) {
4005 /* code first surrogate */
4006 base64bits += 16;
4007 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4008 while (base64bits >= 6) {
4009 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4010 base64bits -= 6;
4011 }
4012 /* prepare second surrogate */
4013 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
4014 }
4015#endif
4016 base64bits += 16;
4017 base64buffer = (base64buffer << 16) | ch;
4018 while (base64bits >= 6) {
4019 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4020 base64bits -= 6;
4021 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004022 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004023 if (base64bits)
4024 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4025 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004026 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004027 if (_PyBytes_Resize(&v, out - start) < 0)
4028 return NULL;
4029 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004030}
4031
Antoine Pitrou244651a2009-05-04 18:56:13 +00004032#undef IS_BASE64
4033#undef FROM_BASE64
4034#undef TO_BASE64
4035#undef DECODE_DIRECT
4036#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004037
Guido van Rossumd57fd912000-03-10 22:53:23 +00004038/* --- UTF-8 Codec -------------------------------------------------------- */
4039
/* Read-only lookup table, indexed by the first byte of a UTF-8 sequence. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4061
Alexander Belopolsky40018472011-02-26 01:02:56 +00004062PyObject *
4063PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004064 Py_ssize_t size,
4065 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004066{
Walter Dörwald69652032004-09-07 20:24:22 +00004067 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4068}
4069
Antoine Pitrouab868312009-01-10 15:40:25 +00004070/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4071#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4072
4073/* Mask to quickly check whether a C 'long' contains a
4074 non-ASCII, UTF8-encoded char. */
4075#if (SIZEOF_LONG == 8)
4076# define ASCII_CHAR_MASK 0x8080808080808080L
4077#elif (SIZEOF_LONG == 4)
4078# define ASCII_CHAR_MASK 0x80808080L
4079#else
4080# error C 'long' size should be either 4 or 8!
4081#endif
4082
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004083/* Scans a UTF-8 string and returns the maximum character to be expected,
4084 the size of the decoded unicode string and if any major errors were
4085 encountered.
4086
4087 This function does check basic UTF-8 sanity, it does however NOT CHECK
4088 if the string contains surrogates, and if all continuation bytes are
4089 within the correct ranges, these checks are performed in
4090 PyUnicode_DecodeUTF8Stateful.
4091
4092 If it sets has_errors to 1, it means the value of unicode_size and max_char
4093 will be bogus and you should not rely on useful information in them.
4094 */
4095static Py_UCS4
4096utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
4097 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
4098 int *has_errors)
4099{
4100 Py_ssize_t n;
4101 Py_ssize_t char_count = 0;
4102 Py_UCS4 max_char = 127, new_max;
4103 Py_UCS4 upper_bound;
4104 const unsigned char *p = (const unsigned char *)s;
4105 const unsigned char *end = p + string_size;
4106 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
4107 int err = 0;
4108
4109 for (; p < end && !err; ++p, ++char_count) {
4110 /* Only check value if it's not a ASCII char... */
4111 if (*p < 0x80) {
4112 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4113 an explanation. */
4114 if (!((size_t) p & LONG_PTR_MASK)) {
4115 /* Help register allocation */
4116 register const unsigned char *_p = p;
4117 while (_p < aligned_end) {
4118 unsigned long value = *(unsigned long *) _p;
4119 if (value & ASCII_CHAR_MASK)
4120 break;
4121 _p += SIZEOF_LONG;
4122 char_count += SIZEOF_LONG;
4123 }
4124 p = _p;
4125 if (p == end)
4126 break;
4127 }
4128 }
4129 if (*p >= 0x80) {
4130 n = utf8_code_length[*p];
4131 new_max = max_char;
4132 switch (n) {
4133 /* invalid start byte */
4134 case 0:
4135 err = 1;
4136 break;
4137 case 2:
4138 /* Code points between 0x00FF and 0x07FF inclusive.
4139 Approximate the upper bound of the code point,
4140 if this flips over 255 we can be sure it will be more
4141 than 255 and the string will need 2 bytes per code coint,
4142 if it stays under or equal to 255, we can be sure 1 byte
4143 is enough.
4144 ((*p & 0b00011111) << 6) | 0b00111111 */
4145 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
4146 if (max_char < upper_bound)
4147 new_max = upper_bound;
4148 /* Ensure we track at least that we left ASCII space. */
4149 if (new_max < 128)
4150 new_max = 128;
4151 break;
4152 case 3:
4153 /* Between 0x0FFF and 0xFFFF inclusive, so values are
4154 always > 255 and <= 65535 and will always need 2 bytes. */
4155 if (max_char < 65535)
4156 new_max = 65535;
4157 break;
4158 case 4:
4159 /* Code point will be above 0xFFFF for sure in this case. */
4160 new_max = 65537;
4161 break;
4162 /* Internal error, this should be caught by the first if */
4163 case 1:
4164 default:
4165 assert(0 && "Impossible case in utf8_max_char_and_size");
4166 err = 1;
4167 }
4168 /* Instead of number of overall bytes for this code point,
Georg Brandl7597add2011-10-05 16:36:47 +02004169 n contains the number of following bytes: */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004170 --n;
4171 /* Check if the follow up chars are all valid continuation bytes */
4172 if (n >= 1) {
4173 const unsigned char *cont;
4174 if ((p + n) >= end) {
4175 if (consumed == 0)
4176 /* incomplete data, non-incremental decoding */
4177 err = 1;
4178 break;
4179 }
4180 for (cont = p + 1; cont < (p + n); ++cont) {
4181 if ((*cont & 0xc0) != 0x80) {
4182 err = 1;
4183 break;
4184 }
4185 }
4186 p += n;
4187 }
4188 else
4189 err = 1;
4190 max_char = new_max;
4191 }
4192 }
4193
4194 if (unicode_size)
4195 *unicode_size = char_count;
4196 if (has_errors)
4197 *has_errors = err;
4198 return max_char;
4199}
4200
/* Store value at position index of buf, converting it to the element
   type selected by kind: Py_UNICODE for PyUnicode_WCHAR_KIND (the wstr
   field of the legacy representation), unsigned char for
   PyUnicode_1BYTE_KIND, Py_UCS2 for PyUnicode_2BYTE_KIND and Py_UCS4 for
   any other kind.  Similar to PyUnicode_WRITE, which cannot write into
   wstr.  kind is evaluated exactly once. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value) \
    do { \
        const int k_ = (kind); \
        switch (k_) { \
        case PyUnicode_WCHAR_KIND: \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
            break; \
        case PyUnicode_1BYTE_KIND: \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
            break; \
        case PyUnicode_2BYTE_KIND: \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value); \
            break; \
        default: \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value); \
            break; \
        } \
    } while (0)
4215
Alexander Belopolsky40018472011-02-26 01:02:56 +00004216PyObject *
4217PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004218 Py_ssize_t size,
4219 const char *errors,
4220 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004221{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004222 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004223 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004224 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004225 Py_ssize_t startinpos;
4226 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00004227 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004228 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004229 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004230 PyObject *errorHandler = NULL;
4231 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004232 Py_UCS4 maxchar = 0;
4233 Py_ssize_t unicode_size;
4234 Py_ssize_t i;
4235 int kind;
4236 void *data;
4237 int has_errors;
4238 Py_UNICODE *error_outptr;
4239#if SIZEOF_WCHAR_T == 2
4240 Py_ssize_t wchar_offset = 0;
4241#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004242
Walter Dörwald69652032004-09-07 20:24:22 +00004243 if (size == 0) {
4244 if (consumed)
4245 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004246 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00004247 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004248 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
4249 consumed, &has_errors);
4250 if (has_errors) {
4251 unicode = _PyUnicode_New(size);
4252 if (!unicode)
4253 return NULL;
4254 kind = PyUnicode_WCHAR_KIND;
4255 data = PyUnicode_AS_UNICODE(unicode);
4256 assert(data != NULL);
4257 }
4258 else {
4259 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
4260 if (!unicode)
4261 return NULL;
4262 /* When the string is ASCII only, just use memcpy and return.
4263 unicode_size may be != size if there is an incomplete UTF-8
4264 sequence at the end of the ASCII block. */
4265 if (maxchar < 128 && size == unicode_size) {
4266 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4267 return (PyObject *)unicode;
4268 }
4269 kind = PyUnicode_KIND(unicode);
4270 data = PyUnicode_DATA(unicode);
4271 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004272 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004273 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004274 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00004275 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004276
4277 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004278 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004279
4280 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004281 /* Fast path for runs of ASCII characters. Given that common UTF-8
4282 input will consist of an overwhelming majority of ASCII
4283 characters, we try to optimize for this case by checking
4284 as many characters as a C 'long' can contain.
4285 First, check if we can do an aligned read, as most CPUs have
4286 a penalty for unaligned reads.
4287 */
4288 if (!((size_t) s & LONG_PTR_MASK)) {
4289 /* Help register allocation */
4290 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004291 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004292 while (_s < aligned_end) {
4293 /* Read a whole long at a time (either 4 or 8 bytes),
4294 and do a fast unrolled copy if it only contains ASCII
4295 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004296 unsigned long value = *(unsigned long *) _s;
4297 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004298 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004299 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4300 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4301 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4302 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004303#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004304 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4305 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4306 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4307 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004308#endif
4309 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004310 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004311 }
4312 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004313 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004314 if (s == e)
4315 break;
4316 ch = (unsigned char)*s;
4317 }
4318 }
4319
4320 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004321 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004322 s++;
4323 continue;
4324 }
4325
4326 n = utf8_code_length[ch];
4327
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004328 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004329 if (consumed)
4330 break;
4331 else {
4332 errmsg = "unexpected end of data";
4333 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004334 endinpos = startinpos+1;
4335 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4336 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004337 goto utf8Error;
4338 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004339 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004340
4341 switch (n) {
4342
4343 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004344 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004345 startinpos = s-starts;
4346 endinpos = startinpos+1;
4347 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004348
4349 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004350 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004351 startinpos = s-starts;
4352 endinpos = startinpos+1;
4353 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004354
4355 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004356 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004357 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004358 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004359 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004360 goto utf8Error;
4361 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004362 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004363 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004364 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004365 break;
4366
4367 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004368 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4369 will result in surrogates in range d800-dfff. Surrogates are
4370 not valid UTF-8 so they are rejected.
4371 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4372 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004373 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004374 (s[2] & 0xc0) != 0x80 ||
4375 ((unsigned char)s[0] == 0xE0 &&
4376 (unsigned char)s[1] < 0xA0) ||
4377 ((unsigned char)s[0] == 0xED &&
4378 (unsigned char)s[1] > 0x9F)) {
4379 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004380 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004381 endinpos = startinpos + 1;
4382
4383 /* if s[1] first two bits are 1 and 0, then the invalid
4384 continuation byte is s[2], so increment endinpos by 1,
4385 if not, s[1] is invalid and endinpos doesn't need to
4386 be incremented. */
4387 if ((s[1] & 0xC0) == 0x80)
4388 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004389 goto utf8Error;
4390 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004391 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004392 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004393 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004394 break;
4395
4396 case 4:
4397 if ((s[1] & 0xc0) != 0x80 ||
4398 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004399 (s[3] & 0xc0) != 0x80 ||
4400 ((unsigned char)s[0] == 0xF0 &&
4401 (unsigned char)s[1] < 0x90) ||
4402 ((unsigned char)s[0] == 0xF4 &&
4403 (unsigned char)s[1] > 0x8F)) {
4404 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004405 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004406 endinpos = startinpos + 1;
4407 if ((s[1] & 0xC0) == 0x80) {
4408 endinpos++;
4409 if ((s[2] & 0xC0) == 0x80)
4410 endinpos++;
4411 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004412 goto utf8Error;
4413 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004414 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004415 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4416 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4417
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004418 /* If the string is flexible or we have native UCS-4, write
4419 directly.. */
4420 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4421 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004422
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004423 else {
4424 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004425
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004426 /* translate from 10000..10FFFF to 0..FFFF */
4427 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004428
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004429 /* high surrogate = top 10 bits added to D800 */
4430 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4431 (Py_UNICODE)(0xD800 + (ch >> 10)));
4432
4433 /* low surrogate = bottom 10 bits added to DC00 */
4434 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4435 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4436 }
4437#if SIZEOF_WCHAR_T == 2
4438 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004439#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004440 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004441 }
4442 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004443 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004444
Benjamin Peterson29060642009-01-31 22:14:21 +00004445 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004446 /* If this is not yet a resizable string, make it one.. */
4447 if (kind != PyUnicode_WCHAR_KIND) {
4448 const Py_UNICODE *u;
4449 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4450 if (!new_unicode)
4451 goto onError;
4452 u = PyUnicode_AsUnicode((PyObject *)unicode);
4453 if (!u)
4454 goto onError;
4455#if SIZEOF_WCHAR_T == 2
4456 i += wchar_offset;
4457#endif
4458 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4459 Py_DECREF(unicode);
4460 unicode = new_unicode;
4461 kind = 0;
4462 data = PyUnicode_AS_UNICODE(new_unicode);
4463 assert(data != NULL);
4464 }
4465 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004466 if (unicode_decode_call_errorhandler(
4467 errors, &errorHandler,
4468 "utf8", errmsg,
4469 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004470 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004471 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004472 /* Update data because unicode_decode_call_errorhandler might have
4473 re-created or resized the unicode object. */
4474 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004475 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004476 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004477 /* Ensure the unicode_size calculation above was correct: */
4478 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4479
Walter Dörwald69652032004-09-07 20:24:22 +00004480 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004481 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004482
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004483 /* Adjust length and ready string when it contained errors and
4484 is of the old resizable kind. */
4485 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004486 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004487 goto onError;
4488 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004489
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004490 Py_XDECREF(errorHandler);
4491 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004492#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004493 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004494 Py_DECREF(unicode);
4495 return NULL;
4496 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004497#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004498 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004499 return (PyObject *)unicode;
4500
Benjamin Peterson29060642009-01-31 22:14:21 +00004501 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004502 Py_XDECREF(errorHandler);
4503 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004504 Py_DECREF(unicode);
4505 return NULL;
4506}
4507
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004508#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004509
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004510#ifdef __APPLE__
4511
4512/* Simplified UTF-8 decoder using surrogateescape error handler,
4513 used to decode the command line arguments on Mac OS X. */
4514
4515wchar_t*
4516_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4517{
4518 int n;
4519 const char *e;
4520 wchar_t *unicode, *p;
4521
4522 /* Note: size will always be longer than the resulting Unicode
4523 character count */
4524 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4525 PyErr_NoMemory();
4526 return NULL;
4527 }
4528 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4529 if (!unicode)
4530 return NULL;
4531
4532 /* Unpack UTF-8 encoded data */
4533 p = unicode;
4534 e = s + size;
4535 while (s < e) {
4536 Py_UCS4 ch = (unsigned char)*s;
4537
4538 if (ch < 0x80) {
4539 *p++ = (wchar_t)ch;
4540 s++;
4541 continue;
4542 }
4543
4544 n = utf8_code_length[ch];
4545 if (s + n > e) {
4546 goto surrogateescape;
4547 }
4548
4549 switch (n) {
4550 case 0:
4551 case 1:
4552 goto surrogateescape;
4553
4554 case 2:
4555 if ((s[1] & 0xc0) != 0x80)
4556 goto surrogateescape;
4557 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4558 assert ((ch > 0x007F) && (ch <= 0x07FF));
4559 *p++ = (wchar_t)ch;
4560 break;
4561
4562 case 3:
4563 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4564 will result in surrogates in range d800-dfff. Surrogates are
4565 not valid UTF-8 so they are rejected.
4566 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4567 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4568 if ((s[1] & 0xc0) != 0x80 ||
4569 (s[2] & 0xc0) != 0x80 ||
4570 ((unsigned char)s[0] == 0xE0 &&
4571 (unsigned char)s[1] < 0xA0) ||
4572 ((unsigned char)s[0] == 0xED &&
4573 (unsigned char)s[1] > 0x9F)) {
4574
4575 goto surrogateescape;
4576 }
4577 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4578 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004579 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004580 break;
4581
4582 case 4:
4583 if ((s[1] & 0xc0) != 0x80 ||
4584 (s[2] & 0xc0) != 0x80 ||
4585 (s[3] & 0xc0) != 0x80 ||
4586 ((unsigned char)s[0] == 0xF0 &&
4587 (unsigned char)s[1] < 0x90) ||
4588 ((unsigned char)s[0] == 0xF4 &&
4589 (unsigned char)s[1] > 0x8F)) {
4590 goto surrogateescape;
4591 }
4592 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4593 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4594 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4595
4596#if SIZEOF_WCHAR_T == 4
4597 *p++ = (wchar_t)ch;
4598#else
4599 /* compute and append the two surrogates: */
4600
4601 /* translate from 10000..10FFFF to 0..FFFF */
4602 ch -= 0x10000;
4603
4604 /* high surrogate = top 10 bits added to D800 */
4605 *p++ = (wchar_t)(0xD800 + (ch >> 10));
4606
4607 /* low surrogate = bottom 10 bits added to DC00 */
4608 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4609#endif
4610 break;
4611 }
4612 s += n;
4613 continue;
4614
4615 surrogateescape:
4616 *p++ = 0xDC00 + ch;
4617 s++;
4618 }
4619 *p = L'\0';
4620 return unicode;
4621}
4622
4623#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004624
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004625/* Primary internal function which creates utf8 encoded bytes objects.
4626
4627 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004628 and allocate exactly as much space needed at the end. Else allocate the
4629 maximum possible needed (4 result bytes per Unicode character), and return
4630 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004631*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004632PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004633_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004634{
Tim Peters602f7402002-04-27 18:03:26 +00004635#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004636
Guido van Rossum98297ee2007-11-06 21:34:58 +00004637 Py_ssize_t i; /* index into s of next input byte */
4638 PyObject *result; /* result string object */
4639 char *p; /* next free byte in output buffer */
4640 Py_ssize_t nallocated; /* number of result bytes allocated */
4641 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004642 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004643 PyObject *errorHandler = NULL;
4644 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004645 int kind;
4646 void *data;
4647 Py_ssize_t size;
4648 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4649#if SIZEOF_WCHAR_T == 2
4650 Py_ssize_t wchar_offset = 0;
4651#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004652
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004653 if (!PyUnicode_Check(unicode)) {
4654 PyErr_BadArgument();
4655 return NULL;
4656 }
4657
4658 if (PyUnicode_READY(unicode) == -1)
4659 return NULL;
4660
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004661 if (PyUnicode_UTF8(unicode))
4662 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4663 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004664
4665 kind = PyUnicode_KIND(unicode);
4666 data = PyUnicode_DATA(unicode);
4667 size = PyUnicode_GET_LENGTH(unicode);
4668
Tim Peters602f7402002-04-27 18:03:26 +00004669 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004670
Tim Peters602f7402002-04-27 18:03:26 +00004671 if (size <= MAX_SHORT_UNICHARS) {
4672 /* Write into the stack buffer; nallocated can't overflow.
4673 * At the end, we'll allocate exactly as much heap space as it
4674 * turns out we need.
4675 */
4676 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004677 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004678 p = stackbuf;
4679 }
4680 else {
4681 /* Overallocate on the heap, and give the excess back at the end. */
4682 nallocated = size * 4;
4683 if (nallocated / 4 != size) /* overflow! */
4684 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004685 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004686 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004687 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004688 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004689 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004690
Tim Peters602f7402002-04-27 18:03:26 +00004691 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004692 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004693
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004694 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004695 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004696 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004697
Guido van Rossumd57fd912000-03-10 22:53:23 +00004698 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004699 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004700 *p++ = (char)(0xc0 | (ch >> 6));
4701 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004702 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004703 Py_ssize_t newpos;
4704 PyObject *rep;
4705 Py_ssize_t repsize, k, startpos;
4706 startpos = i-1;
4707#if SIZEOF_WCHAR_T == 2
4708 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004709#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004710 rep = unicode_encode_call_errorhandler(
4711 errors, &errorHandler, "utf-8", "surrogates not allowed",
4712 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4713 &exc, startpos, startpos+1, &newpos);
4714 if (!rep)
4715 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004716
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004717 if (PyBytes_Check(rep))
4718 repsize = PyBytes_GET_SIZE(rep);
4719 else
4720 repsize = PyUnicode_GET_SIZE(rep);
4721
4722 if (repsize > 4) {
4723 Py_ssize_t offset;
4724
4725 if (result == NULL)
4726 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004727 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004728 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004729
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004730 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4731 /* integer overflow */
4732 PyErr_NoMemory();
4733 goto error;
4734 }
4735 nallocated += repsize - 4;
4736 if (result != NULL) {
4737 if (_PyBytes_Resize(&result, nallocated) < 0)
4738 goto error;
4739 } else {
4740 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004741 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004742 goto error;
4743 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4744 }
4745 p = PyBytes_AS_STRING(result) + offset;
4746 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004747
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004748 if (PyBytes_Check(rep)) {
4749 char *prep = PyBytes_AS_STRING(rep);
4750 for(k = repsize; k > 0; k--)
4751 *p++ = *prep++;
4752 } else /* rep is unicode */ {
4753 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4754 Py_UNICODE c;
4755
4756 for(k=0; k<repsize; k++) {
4757 c = prep[k];
4758 if (0x80 <= c) {
4759 raise_encode_exception(&exc, "utf-8",
4760 PyUnicode_AS_UNICODE(unicode),
4761 size, i-1, i,
4762 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004763 goto error;
4764 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004765 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004766 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004767 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004768 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004769 } else if (ch < 0x10000) {
4770 *p++ = (char)(0xe0 | (ch >> 12));
4771 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4772 *p++ = (char)(0x80 | (ch & 0x3f));
4773 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004774 /* Encode UCS4 Unicode ordinals */
4775 *p++ = (char)(0xf0 | (ch >> 18));
4776 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4777 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4778 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004779#if SIZEOF_WCHAR_T == 2
4780 wchar_offset++;
4781#endif
Tim Peters602f7402002-04-27 18:03:26 +00004782 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004783 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004784
Guido van Rossum98297ee2007-11-06 21:34:58 +00004785 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004786 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004787 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004788 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004789 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004790 }
4791 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004792 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004793 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004794 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004795 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004796 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004797
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004798 Py_XDECREF(errorHandler);
4799 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004800 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004801 error:
4802 Py_XDECREF(errorHandler);
4803 Py_XDECREF(exc);
4804 Py_XDECREF(result);
4805 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004806
Tim Peters602f7402002-04-27 18:03:26 +00004807#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004808}
4809
Alexander Belopolsky40018472011-02-26 01:02:56 +00004810PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004811PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4812 Py_ssize_t size,
4813 const char *errors)
4814{
4815 PyObject *v, *unicode;
4816
4817 unicode = PyUnicode_FromUnicode(s, size);
4818 if (unicode == NULL)
4819 return NULL;
4820 v = _PyUnicode_AsUTF8String(unicode, errors);
4821 Py_DECREF(unicode);
4822 return v;
4823}
4824
4825PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004826PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004827{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004828 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004829}
4830
Walter Dörwald41980ca2007-08-16 21:55:45 +00004831/* --- UTF-32 Codec ------------------------------------------------------- */
4832
4833PyObject *
4834PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004835 Py_ssize_t size,
4836 const char *errors,
4837 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004838{
4839 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4840}
4841
4842PyObject *
4843PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004844 Py_ssize_t size,
4845 const char *errors,
4846 int *byteorder,
4847 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004848{
4849 const char *starts = s;
4850 Py_ssize_t startinpos;
4851 Py_ssize_t endinpos;
4852 Py_ssize_t outpos;
4853 PyUnicodeObject *unicode;
4854 Py_UNICODE *p;
4855#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004856 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004857 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004858#else
4859 const int pairs = 0;
4860#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004861 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004862 int bo = 0; /* assume native ordering by default */
4863 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004864 /* Offsets from q for retrieving bytes in the right order. */
4865#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4866 int iorder[] = {0, 1, 2, 3};
4867#else
4868 int iorder[] = {3, 2, 1, 0};
4869#endif
4870 PyObject *errorHandler = NULL;
4871 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004872
Walter Dörwald41980ca2007-08-16 21:55:45 +00004873 q = (unsigned char *)s;
4874 e = q + size;
4875
4876 if (byteorder)
4877 bo = *byteorder;
4878
4879 /* Check for BOM marks (U+FEFF) in the input and adjust current
4880 byte order setting accordingly. In native mode, the leading BOM
4881 mark is skipped, in all other modes, it is copied to the output
4882 stream as-is (giving a ZWNBSP character). */
4883 if (bo == 0) {
4884 if (size >= 4) {
4885 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004886 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004887#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004888 if (bom == 0x0000FEFF) {
4889 q += 4;
4890 bo = -1;
4891 }
4892 else if (bom == 0xFFFE0000) {
4893 q += 4;
4894 bo = 1;
4895 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004896#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004897 if (bom == 0x0000FEFF) {
4898 q += 4;
4899 bo = 1;
4900 }
4901 else if (bom == 0xFFFE0000) {
4902 q += 4;
4903 bo = -1;
4904 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004905#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004906 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004907 }
4908
4909 if (bo == -1) {
4910 /* force LE */
4911 iorder[0] = 0;
4912 iorder[1] = 1;
4913 iorder[2] = 2;
4914 iorder[3] = 3;
4915 }
4916 else if (bo == 1) {
4917 /* force BE */
4918 iorder[0] = 3;
4919 iorder[1] = 2;
4920 iorder[2] = 1;
4921 iorder[3] = 0;
4922 }
4923
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004924 /* On narrow builds we split characters outside the BMP into two
4925 codepoints => count how much extra space we need. */
4926#ifndef Py_UNICODE_WIDE
4927 for (qq = q; qq < e; qq += 4)
4928 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4929 pairs++;
4930#endif
4931
4932 /* This might be one to much, because of a BOM */
4933 unicode = _PyUnicode_New((size+3)/4+pairs);
4934 if (!unicode)
4935 return NULL;
4936 if (size == 0)
4937 return (PyObject *)unicode;
4938
4939 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004940 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004941
Walter Dörwald41980ca2007-08-16 21:55:45 +00004942 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004943 Py_UCS4 ch;
4944 /* remaining bytes at the end? (size should be divisible by 4) */
4945 if (e-q<4) {
4946 if (consumed)
4947 break;
4948 errmsg = "truncated data";
4949 startinpos = ((const char *)q)-starts;
4950 endinpos = ((const char *)e)-starts;
4951 goto utf32Error;
4952 /* The remaining input chars are ignored if the callback
4953 chooses to skip the input */
4954 }
4955 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4956 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004957
Benjamin Peterson29060642009-01-31 22:14:21 +00004958 if (ch >= 0x110000)
4959 {
4960 errmsg = "codepoint not in range(0x110000)";
4961 startinpos = ((const char *)q)-starts;
4962 endinpos = startinpos+4;
4963 goto utf32Error;
4964 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004965#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004966 if (ch >= 0x10000)
4967 {
4968 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4969 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4970 }
4971 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004972#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004973 *p++ = ch;
4974 q += 4;
4975 continue;
4976 utf32Error:
4977 outpos = p-PyUnicode_AS_UNICODE(unicode);
4978 if (unicode_decode_call_errorhandler(
4979 errors, &errorHandler,
4980 "utf32", errmsg,
4981 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4982 &unicode, &outpos, &p))
4983 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004984 }
4985
4986 if (byteorder)
4987 *byteorder = bo;
4988
4989 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004990 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004991
4992 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02004993 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004994 goto onError;
4995
4996 Py_XDECREF(errorHandler);
4997 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004998#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004999 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005000 Py_DECREF(unicode);
5001 return NULL;
5002 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005003#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005004 assert(_PyUnicode_CheckConsistency(unicode, 1));
Walter Dörwald41980ca2007-08-16 21:55:45 +00005005 return (PyObject *)unicode;
5006
Benjamin Peterson29060642009-01-31 22:14:21 +00005007 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005008 Py_DECREF(unicode);
5009 Py_XDECREF(errorHandler);
5010 Py_XDECREF(exc);
5011 return NULL;
5012}
5013
5014PyObject *
5015PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005016 Py_ssize_t size,
5017 const char *errors,
5018 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005019{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005020 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005021 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005022 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005023#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005024 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005025#else
5026 const int pairs = 0;
5027#endif
5028 /* Offsets from p for storing byte pairs in the right order. */
5029#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5030 int iorder[] = {0, 1, 2, 3};
5031#else
5032 int iorder[] = {3, 2, 1, 0};
5033#endif
5034
Benjamin Peterson29060642009-01-31 22:14:21 +00005035#define STORECHAR(CH) \
5036 do { \
5037 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5038 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5039 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5040 p[iorder[0]] = (CH) & 0xff; \
5041 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005042 } while(0)
5043
5044 /* In narrow builds we can output surrogate pairs as one codepoint,
5045 so we need less space. */
5046#ifndef Py_UNICODE_WIDE
5047 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005048 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
5049 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
5050 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005051#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005052 nsize = (size - pairs + (byteorder == 0));
5053 bytesize = nsize * 4;
5054 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005055 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005056 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005057 if (v == NULL)
5058 return NULL;
5059
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005060 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005061 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005062 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005063 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005064 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005065
5066 if (byteorder == -1) {
5067 /* force LE */
5068 iorder[0] = 0;
5069 iorder[1] = 1;
5070 iorder[2] = 2;
5071 iorder[3] = 3;
5072 }
5073 else if (byteorder == 1) {
5074 /* force BE */
5075 iorder[0] = 3;
5076 iorder[1] = 2;
5077 iorder[2] = 1;
5078 iorder[3] = 0;
5079 }
5080
5081 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005082 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005083#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005084 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
5085 Py_UCS4 ch2 = *s;
5086 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
5087 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
5088 s++;
5089 size--;
5090 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005091 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005092#endif
5093 STORECHAR(ch);
5094 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005095
5096 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005097 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005098#undef STORECHAR
5099}
5100
Alexander Belopolsky40018472011-02-26 01:02:56 +00005101PyObject *
5102PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005103{
5104 if (!PyUnicode_Check(unicode)) {
5105 PyErr_BadArgument();
5106 return NULL;
5107 }
5108 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005109 PyUnicode_GET_SIZE(unicode),
5110 NULL,
5111 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005112}
5113
Guido van Rossumd57fd912000-03-10 22:53:23 +00005114/* --- UTF-16 Codec ------------------------------------------------------- */
5115
Tim Peters772747b2001-08-09 22:21:55 +00005116PyObject *
5117PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005118 Py_ssize_t size,
5119 const char *errors,
5120 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005121{
Walter Dörwald69652032004-09-07 20:24:22 +00005122 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5123}
5124
Antoine Pitrouab868312009-01-10 15:40:25 +00005125/* Two masks for fast checking of whether a C 'long' may contain
5126 UTF16-encoded surrogate characters. This is an efficient heuristic,
5127 assuming that non-surrogate characters with a code point >= 0x8000 are
5128 rare in most input.
5129 FAST_CHAR_MASK is used when the input is in native byte ordering,
5130 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005131*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005132#if (SIZEOF_LONG == 8)
5133# define FAST_CHAR_MASK 0x8000800080008000L
5134# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5135#elif (SIZEOF_LONG == 4)
5136# define FAST_CHAR_MASK 0x80008000L
5137# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5138#else
5139# error C 'long' size should be either 4 or 8!
5140#endif
5141
Walter Dörwald69652032004-09-07 20:24:22 +00005142PyObject *
5143PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005144 Py_ssize_t size,
5145 const char *errors,
5146 int *byteorder,
5147 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005148{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005149 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005150 Py_ssize_t startinpos;
5151 Py_ssize_t endinpos;
5152 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005153 PyUnicodeObject *unicode;
5154 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00005155 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005156 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005157 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005158 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005159 /* Offsets from q for retrieving byte pairs in the right order. */
5160#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5161 int ihi = 1, ilo = 0;
5162#else
5163 int ihi = 0, ilo = 1;
5164#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005165 PyObject *errorHandler = NULL;
5166 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005167
5168 /* Note: size will always be longer than the resulting Unicode
5169 character count */
5170 unicode = _PyUnicode_New(size);
5171 if (!unicode)
5172 return NULL;
5173 if (size == 0)
5174 return (PyObject *)unicode;
5175
5176 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005177 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00005178 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005179 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005180
5181 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005182 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005183
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005184 /* Check for BOM marks (U+FEFF) in the input and adjust current
5185 byte order setting accordingly. In native mode, the leading BOM
5186 mark is skipped, in all other modes, it is copied to the output
5187 stream as-is (giving a ZWNBSP character). */
5188 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005189 if (size >= 2) {
5190 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005191#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005192 if (bom == 0xFEFF) {
5193 q += 2;
5194 bo = -1;
5195 }
5196 else if (bom == 0xFFFE) {
5197 q += 2;
5198 bo = 1;
5199 }
Tim Petersced69f82003-09-16 20:30:58 +00005200#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005201 if (bom == 0xFEFF) {
5202 q += 2;
5203 bo = 1;
5204 }
5205 else if (bom == 0xFFFE) {
5206 q += 2;
5207 bo = -1;
5208 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005209#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005210 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005211 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005212
Tim Peters772747b2001-08-09 22:21:55 +00005213 if (bo == -1) {
5214 /* force LE */
5215 ihi = 1;
5216 ilo = 0;
5217 }
5218 else if (bo == 1) {
5219 /* force BE */
5220 ihi = 0;
5221 ilo = 1;
5222 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005223#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5224 native_ordering = ilo < ihi;
5225#else
5226 native_ordering = ilo > ihi;
5227#endif
Tim Peters772747b2001-08-09 22:21:55 +00005228
Antoine Pitrouab868312009-01-10 15:40:25 +00005229 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005230 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005231 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005232 /* First check for possible aligned read of a C 'long'. Unaligned
5233 reads are more expensive, better to defer to another iteration. */
5234 if (!((size_t) q & LONG_PTR_MASK)) {
5235 /* Fast path for runs of non-surrogate chars. */
5236 register const unsigned char *_q = q;
5237 Py_UNICODE *_p = p;
5238 if (native_ordering) {
5239 /* Native ordering is simple: as long as the input cannot
5240 possibly contain a surrogate char, do an unrolled copy
5241 of several 16-bit code points to the target object.
5242 The non-surrogate check is done on several input bytes
5243 at a time (as many as a C 'long' can contain). */
5244 while (_q < aligned_end) {
5245 unsigned long data = * (unsigned long *) _q;
5246 if (data & FAST_CHAR_MASK)
5247 break;
5248 _p[0] = ((unsigned short *) _q)[0];
5249 _p[1] = ((unsigned short *) _q)[1];
5250#if (SIZEOF_LONG == 8)
5251 _p[2] = ((unsigned short *) _q)[2];
5252 _p[3] = ((unsigned short *) _q)[3];
5253#endif
5254 _q += SIZEOF_LONG;
5255 _p += SIZEOF_LONG / 2;
5256 }
5257 }
5258 else {
5259 /* Byteswapped ordering is similar, but we must decompose
5260 the copy bytewise, and take care of zero'ing out the
5261 upper bytes if the target object is in 32-bit units
5262 (that is, in UCS-4 builds). */
5263 while (_q < aligned_end) {
5264 unsigned long data = * (unsigned long *) _q;
5265 if (data & SWAPPED_FAST_CHAR_MASK)
5266 break;
5267 /* Zero upper bytes in UCS-4 builds */
5268#if (Py_UNICODE_SIZE > 2)
5269 _p[0] = 0;
5270 _p[1] = 0;
5271#if (SIZEOF_LONG == 8)
5272 _p[2] = 0;
5273 _p[3] = 0;
5274#endif
5275#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005276 /* Issue #4916; UCS-4 builds on big endian machines must
5277 fill the two last bytes of each 4-byte unit. */
5278#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
5279# define OFF 2
5280#else
5281# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00005282#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005283 ((unsigned char *) _p)[OFF + 1] = _q[0];
5284 ((unsigned char *) _p)[OFF + 0] = _q[1];
5285 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
5286 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
5287#if (SIZEOF_LONG == 8)
5288 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
5289 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
5290 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
5291 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
5292#endif
5293#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00005294 _q += SIZEOF_LONG;
5295 _p += SIZEOF_LONG / 2;
5296 }
5297 }
5298 p = _p;
5299 q = _q;
5300 if (q >= e)
5301 break;
5302 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005303 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005304
Benjamin Peterson14339b62009-01-31 16:36:08 +00005305 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005306
5307 if (ch < 0xD800 || ch > 0xDFFF) {
5308 *p++ = ch;
5309 continue;
5310 }
5311
5312 /* UTF-16 code pair: */
5313 if (q > e) {
5314 errmsg = "unexpected end of data";
5315 startinpos = (((const char *)q) - 2) - starts;
5316 endinpos = ((const char *)e) + 1 - starts;
5317 goto utf16Error;
5318 }
5319 if (0xD800 <= ch && ch <= 0xDBFF) {
5320 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5321 q += 2;
5322 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00005323#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005324 *p++ = ch;
5325 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005326#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005327 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005328#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005329 continue;
5330 }
5331 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005332 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005333 startinpos = (((const char *)q)-4)-starts;
5334 endinpos = startinpos+2;
5335 goto utf16Error;
5336 }
5337
Benjamin Peterson14339b62009-01-31 16:36:08 +00005338 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005339 errmsg = "illegal encoding";
5340 startinpos = (((const char *)q)-2)-starts;
5341 endinpos = startinpos+2;
5342 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005343
Benjamin Peterson29060642009-01-31 22:14:21 +00005344 utf16Error:
5345 outpos = p - PyUnicode_AS_UNICODE(unicode);
5346 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005347 errors,
5348 &errorHandler,
5349 "utf16", errmsg,
5350 &starts,
5351 (const char **)&e,
5352 &startinpos,
5353 &endinpos,
5354 &exc,
5355 (const char **)&q,
5356 &unicode,
5357 &outpos,
5358 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00005359 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005360 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005361 /* remaining byte at the end? (size should be even) */
5362 if (e == q) {
5363 if (!consumed) {
5364 errmsg = "truncated data";
5365 startinpos = ((const char *)q) - starts;
5366 endinpos = ((const char *)e) + 1 - starts;
5367 outpos = p - PyUnicode_AS_UNICODE(unicode);
5368 if (unicode_decode_call_errorhandler(
5369 errors,
5370 &errorHandler,
5371 "utf16", errmsg,
5372 &starts,
5373 (const char **)&e,
5374 &startinpos,
5375 &endinpos,
5376 &exc,
5377 (const char **)&q,
5378 &unicode,
5379 &outpos,
5380 &p))
5381 goto onError;
5382 /* The remaining input chars are ignored if the callback
5383 chooses to skip the input */
5384 }
5385 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005386
5387 if (byteorder)
5388 *byteorder = bo;
5389
Walter Dörwald69652032004-09-07 20:24:22 +00005390 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005391 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005392
Guido van Rossumd57fd912000-03-10 22:53:23 +00005393 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005394 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005395 goto onError;
5396
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005397 Py_XDECREF(errorHandler);
5398 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005399#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005400 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005401 Py_DECREF(unicode);
5402 return NULL;
5403 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005404#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005405 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005406 return (PyObject *)unicode;
5407
Benjamin Peterson29060642009-01-31 22:14:21 +00005408 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005409 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005410 Py_XDECREF(errorHandler);
5411 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005412 return NULL;
5413}
5414
Antoine Pitrouab868312009-01-10 15:40:25 +00005415#undef FAST_CHAR_MASK
5416#undef SWAPPED_FAST_CHAR_MASK
5417
Tim Peters772747b2001-08-09 22:21:55 +00005418PyObject *
5419PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005420 Py_ssize_t size,
5421 const char *errors,
5422 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005424 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005425 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005426 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005427#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005428 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005429#else
5430 const int pairs = 0;
5431#endif
Tim Peters772747b2001-08-09 22:21:55 +00005432 /* Offsets from p for storing byte pairs in the right order. */
5433#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5434 int ihi = 1, ilo = 0;
5435#else
5436 int ihi = 0, ilo = 1;
5437#endif
5438
Benjamin Peterson29060642009-01-31 22:14:21 +00005439#define STORECHAR(CH) \
5440 do { \
5441 p[ihi] = ((CH) >> 8) & 0xff; \
5442 p[ilo] = (CH) & 0xff; \
5443 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005444 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005445
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005446#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005447 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005448 if (s[i] >= 0x10000)
5449 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005450#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005451 /* 2 * (size + pairs + (byteorder == 0)) */
5452 if (size > PY_SSIZE_T_MAX ||
5453 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005454 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005455 nsize = size + pairs + (byteorder == 0);
5456 bytesize = nsize * 2;
5457 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005458 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005459 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005460 if (v == NULL)
5461 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005462
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005463 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005464 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005465 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005466 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005467 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005468
5469 if (byteorder == -1) {
5470 /* force LE */
5471 ihi = 1;
5472 ilo = 0;
5473 }
5474 else if (byteorder == 1) {
5475 /* force BE */
5476 ihi = 0;
5477 ilo = 1;
5478 }
5479
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005480 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005481 Py_UNICODE ch = *s++;
5482 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005483#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005484 if (ch >= 0x10000) {
5485 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5486 ch = 0xD800 | ((ch-0x10000) >> 10);
5487 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005488#endif
Tim Peters772747b2001-08-09 22:21:55 +00005489 STORECHAR(ch);
5490 if (ch2)
5491 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005492 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005493
5494 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005495 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005496#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005497}
5498
Alexander Belopolsky40018472011-02-26 01:02:56 +00005499PyObject *
5500PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005501{
5502 if (!PyUnicode_Check(unicode)) {
5503 PyErr_BadArgument();
5504 return NULL;
5505 }
5506 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005507 PyUnicode_GET_SIZE(unicode),
5508 NULL,
5509 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005510}
5511
5512/* --- Unicode Escape Codec ----------------------------------------------- */
5513
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005514/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5515 if all the escapes in the string make it still a valid ASCII string.
5516 Returns -1 if any escapes were found which cause the string to
5517 pop out of ASCII range. Otherwise returns the length of the
5518 required buffer to hold the string.
5519 */
5520Py_ssize_t
5521length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5522{
5523 const unsigned char *p = (const unsigned char *)s;
5524 const unsigned char *end = p + size;
5525 Py_ssize_t length = 0;
5526
5527 if (size < 0)
5528 return -1;
5529
5530 for (; p < end; ++p) {
5531 if (*p > 127) {
5532 /* Non-ASCII */
5533 return -1;
5534 }
5535 else if (*p != '\\') {
5536 /* Normal character */
5537 ++length;
5538 }
5539 else {
5540 /* Backslash-escape, check next char */
5541 ++p;
5542 /* Escape sequence reaches till end of string or
5543 non-ASCII follow-up. */
5544 if (p >= end || *p > 127)
5545 return -1;
5546 switch (*p) {
5547 case '\n':
5548 /* backslash + \n result in zero characters */
5549 break;
5550 case '\\': case '\'': case '\"':
5551 case 'b': case 'f': case 't':
5552 case 'n': case 'r': case 'v': case 'a':
5553 ++length;
5554 break;
5555 case '0': case '1': case '2': case '3':
5556 case '4': case '5': case '6': case '7':
5557 case 'x': case 'u': case 'U': case 'N':
5558 /* these do not guarantee ASCII characters */
5559 return -1;
5560 default:
5561 /* count the backslash + the other character */
5562 length += 2;
5563 }
5564 }
5565 }
5566 return length;
5567}
5568
5569/* Similar to PyUnicode_WRITE but either write into wstr field
5570 or treat string as ASCII. */
5571#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
5572 do { \
5573 if ((kind) != PyUnicode_WCHAR_KIND) \
5574 ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
5575 else \
5576 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
5577 } while (0)
5578
5579#define WRITE_WSTR(buf, index, value) \
5580 assert(kind == PyUnicode_WCHAR_KIND), \
5581 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value)
5582
5583
Fredrik Lundh06d12682001-01-24 07:59:11 +00005584static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005585
Alexander Belopolsky40018472011-02-26 01:02:56 +00005586PyObject *
5587PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005588 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005589 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005590{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005591 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005592 Py_ssize_t startinpos;
5593 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005594 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005595 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005596 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005597 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005598 char* message;
5599 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005600 PyObject *errorHandler = NULL;
5601 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005602 Py_ssize_t ascii_length;
5603 Py_ssize_t i;
5604 int kind;
5605 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005606
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005607 ascii_length = length_of_escaped_ascii_string(s, size);
5608
5609 /* After length_of_escaped_ascii_string() there are two alternatives,
5610 either the string is pure ASCII with named escapes like \n, etc.
5611 and we determined it's exact size (common case)
5612 or it contains \x, \u, ... escape sequences. then we create a
5613 legacy wchar string and resize it at the end of this function. */
5614 if (ascii_length >= 0) {
5615 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5616 if (!v)
5617 goto onError;
5618 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5619 kind = PyUnicode_1BYTE_KIND;
5620 data = PyUnicode_DATA(v);
5621 }
5622 else {
5623 /* Escaped strings will always be longer than the resulting
5624 Unicode string, so we start with size here and then reduce the
5625 length after conversion to the true value.
5626 (but if the error callback returns a long replacement string
5627 we'll have to allocate more space) */
5628 v = _PyUnicode_New(size);
5629 if (!v)
5630 goto onError;
5631 kind = PyUnicode_WCHAR_KIND;
5632 data = PyUnicode_AS_UNICODE(v);
5633 }
5634
Guido van Rossumd57fd912000-03-10 22:53:23 +00005635 if (size == 0)
5636 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005637 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005638 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005639
Guido van Rossumd57fd912000-03-10 22:53:23 +00005640 while (s < end) {
5641 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005642 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005643 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005644
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005645 if (kind == PyUnicode_WCHAR_KIND) {
5646 assert(i < _PyUnicode_WSTR_LENGTH(v));
5647 }
5648 else {
5649 /* The only case in which i == ascii_length is a backslash
5650 followed by a newline. */
5651 assert(i <= ascii_length);
5652 }
5653
Guido van Rossumd57fd912000-03-10 22:53:23 +00005654 /* Non-escape characters are interpreted as Unicode ordinals */
5655 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005656 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005657 continue;
5658 }
5659
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005660 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005661 /* \ - Escapes */
5662 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005663 c = *s++;
5664 if (s > end)
5665 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005666
5667 if (kind == PyUnicode_WCHAR_KIND) {
5668 assert(i < _PyUnicode_WSTR_LENGTH(v));
5669 }
5670 else {
5671 /* The only case in which i == ascii_length is a backslash
5672 followed by a newline. */
5673 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5674 }
5675
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005676 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677
Benjamin Peterson29060642009-01-31 22:14:21 +00005678 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005679 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005680 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5681 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5682 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5683 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5684 /* FF */
5685 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5686 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5687 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5688 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5689 /* VT */
5690 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5691 /* BEL, not classic C */
5692 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693
Benjamin Peterson29060642009-01-31 22:14:21 +00005694 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695 case '0': case '1': case '2': case '3':
5696 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005697 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005698 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005699 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005700 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005701 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005702 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005703 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005704 break;
5705
Benjamin Peterson29060642009-01-31 22:14:21 +00005706 /* hex escapes */
5707 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005708 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005709 digits = 2;
5710 message = "truncated \\xXX escape";
5711 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712
Benjamin Peterson29060642009-01-31 22:14:21 +00005713 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005714 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005715 digits = 4;
5716 message = "truncated \\uXXXX escape";
5717 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005718
Benjamin Peterson29060642009-01-31 22:14:21 +00005719 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005720 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005721 digits = 8;
5722 message = "truncated \\UXXXXXXXX escape";
5723 hexescape:
5724 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005725 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005726 if (s+digits>end) {
5727 endinpos = size;
5728 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005729 errors, &errorHandler,
5730 "unicodeescape", "end of string in escape sequence",
5731 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005732 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005733 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005734 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005735 goto nextByte;
5736 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005737 for (j = 0; j < digits; ++j) {
5738 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005739 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005740 endinpos = (s+j+1)-starts;
5741 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005742 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005743 errors, &errorHandler,
5744 "unicodeescape", message,
5745 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005746 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005747 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005748 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005749 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005750 }
5751 chr = (chr<<4) & ~0xF;
5752 if (c >= '0' && c <= '9')
5753 chr += c - '0';
5754 else if (c >= 'a' && c <= 'f')
5755 chr += 10 + c - 'a';
5756 else
5757 chr += 10 + c - 'A';
5758 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005759 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005760 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005761 /* _decoding_error will have already written into the
5762 target buffer. */
5763 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005764 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005765 /* when we get here, chr is a 32-bit unicode character */
5766 if (chr <= 0xffff)
5767 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005768 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005769 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005770 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005771 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005772#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005773 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005774#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005775 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005776 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5777 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005778#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005779 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005780 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005781 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005782 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005783 errors, &errorHandler,
5784 "unicodeescape", "illegal Unicode character",
5785 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005786 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005787 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005788 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005789 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005790 break;
5791
Benjamin Peterson29060642009-01-31 22:14:21 +00005792 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005793 case 'N':
5794 message = "malformed \\N character escape";
5795 if (ucnhash_CAPI == NULL) {
5796 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005797 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5798 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005799 if (ucnhash_CAPI == NULL)
5800 goto ucnhashError;
5801 }
5802 if (*s == '{') {
5803 const char *start = s+1;
5804 /* look for the closing brace */
5805 while (*s != '}' && s < end)
5806 s++;
5807 if (s > start && s < end && *s == '}') {
5808 /* found a name. look it up in the unicode database */
5809 message = "unknown Unicode character name";
5810 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005811 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5812 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005813 goto store;
5814 }
5815 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005816 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005817 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005818 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005819 errors, &errorHandler,
5820 "unicodeescape", message,
5821 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005822 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005823 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005824 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005825 break;
5826
5827 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005828 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005829 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005830 message = "\\ at end of string";
5831 s--;
5832 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005833 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005834 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005835 errors, &errorHandler,
5836 "unicodeescape", message,
5837 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005838 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005839 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005840 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005841 }
5842 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005843 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5844 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005845 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005846 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005847 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005848 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005849 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005850 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005851 /* Ensure the length prediction worked in case of ASCII strings */
5852 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5853
Victor Stinnerfe226c02011-10-03 03:52:20 +02005854 if (kind == PyUnicode_WCHAR_KIND)
5855 {
5856 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5857 goto onError;
Victor Stinnerfe226c02011-10-03 03:52:20 +02005858 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005859 Py_XDECREF(errorHandler);
5860 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005861#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005862 if (_PyUnicode_READY_REPLACE(&v)) {
5863 Py_DECREF(v);
5864 return NULL;
5865 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005866#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005867 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005869
Benjamin Peterson29060642009-01-31 22:14:21 +00005870 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005871 PyErr_SetString(
5872 PyExc_UnicodeError,
5873 "\\N escapes not supported (can't load unicodedata module)"
5874 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005875 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005876 Py_XDECREF(errorHandler);
5877 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005878 return NULL;
5879
Benjamin Peterson29060642009-01-31 22:14:21 +00005880 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005882 Py_XDECREF(errorHandler);
5883 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884 return NULL;
5885}
5886
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005887#undef WRITE_ASCII_OR_WSTR
5888#undef WRITE_WSTR
5889
Guido van Rossumd57fd912000-03-10 22:53:23 +00005890/* Return a Unicode-Escape string version of the Unicode object.
5891
5892 If quotes is true, the string is enclosed in u"" or u'' quotes as
5893 appropriate.
5894
5895*/
5896
Walter Dörwald79e913e2007-05-12 11:08:06 +00005897static const char *hexdigits = "0123456789abcdef";
5898
Alexander Belopolsky40018472011-02-26 01:02:56 +00005899PyObject *
5900PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005901 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005903 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005906#ifdef Py_UNICODE_WIDE
5907 const Py_ssize_t expandsize = 10;
5908#else
5909 const Py_ssize_t expandsize = 6;
5910#endif
5911
Thomas Wouters89f507f2006-12-13 04:49:30 +00005912 /* XXX(nnorwitz): rather than over-allocating, it would be
5913 better to choose a different scheme. Perhaps scan the
5914 first N-chars of the string and allocate based on that size.
5915 */
5916 /* Initial allocation is based on the longest-possible unichr
5917 escape.
5918
5919 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5920 unichr, so in this case it's the longest unichr escape. In
5921 narrow (UTF-16) builds this is five chars per source unichr
5922 since there are two unichrs in the surrogate pair, so in narrow
5923 (UTF-16) builds it's not the longest unichr escape.
5924
5925 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5926 so in the narrow (UTF-16) build case it's the longest unichr
5927 escape.
5928 */
5929
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005930 if (size == 0)
5931 return PyBytes_FromStringAndSize(NULL, 0);
5932
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005933 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005934 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005935
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005936 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005937 2
5938 + expandsize*size
5939 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940 if (repr == NULL)
5941 return NULL;
5942
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005943 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945 while (size-- > 0) {
5946 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005947
Walter Dörwald79e913e2007-05-12 11:08:06 +00005948 /* Escape backslashes */
5949 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950 *p++ = '\\';
5951 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005952 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005953 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005954
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005955#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005956 /* Map 21-bit characters to '\U00xxxxxx' */
5957 else if (ch >= 0x10000) {
5958 *p++ = '\\';
5959 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005960 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5961 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5962 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5963 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5964 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5965 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5966 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5967 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005968 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005969 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005970#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005971 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5972 else if (ch >= 0xD800 && ch < 0xDC00) {
5973 Py_UNICODE ch2;
5974 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005975
Benjamin Peterson29060642009-01-31 22:14:21 +00005976 ch2 = *s++;
5977 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005978 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005979 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5980 *p++ = '\\';
5981 *p++ = 'U';
5982 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5983 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5984 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5985 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5986 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5987 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5988 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5989 *p++ = hexdigits[ucs & 0x0000000F];
5990 continue;
5991 }
5992 /* Fall through: isolated surrogates are copied as-is */
5993 s--;
5994 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005995 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005996#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005997
Guido van Rossumd57fd912000-03-10 22:53:23 +00005998 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005999 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000 *p++ = '\\';
6001 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00006002 *p++ = hexdigits[(ch >> 12) & 0x000F];
6003 *p++ = hexdigits[(ch >> 8) & 0x000F];
6004 *p++ = hexdigits[(ch >> 4) & 0x000F];
6005 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006006 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006007
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006008 /* Map special whitespace to '\t', \n', '\r' */
6009 else if (ch == '\t') {
6010 *p++ = '\\';
6011 *p++ = 't';
6012 }
6013 else if (ch == '\n') {
6014 *p++ = '\\';
6015 *p++ = 'n';
6016 }
6017 else if (ch == '\r') {
6018 *p++ = '\\';
6019 *p++ = 'r';
6020 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006021
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006022 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006023 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006025 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00006026 *p++ = hexdigits[(ch >> 4) & 0x000F];
6027 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006028 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006029
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030 /* Copy everything else as-is */
6031 else
6032 *p++ = (char) ch;
6033 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006034
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006035 assert(p - PyBytes_AS_STRING(repr) > 0);
6036 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6037 return NULL;
6038 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006039}
6040
Alexander Belopolsky40018472011-02-26 01:02:56 +00006041PyObject *
6042PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006043{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006044 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006045 if (!PyUnicode_Check(unicode)) {
6046 PyErr_BadArgument();
6047 return NULL;
6048 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00006049 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6050 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006051 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006052}
6053
6054/* --- Raw Unicode Escape Codec ------------------------------------------- */
6055
Alexander Belopolsky40018472011-02-26 01:02:56 +00006056PyObject *
6057PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006058 Py_ssize_t size,
6059 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006061 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006062 Py_ssize_t startinpos;
6063 Py_ssize_t endinpos;
6064 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006065 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006066 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006067 const char *end;
6068 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006069 PyObject *errorHandler = NULL;
6070 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006071
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072 /* Escaped strings will always be longer than the resulting
6073 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006074 length after conversion to the true value. (But decoding error
6075 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076 v = _PyUnicode_New(size);
6077 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006078 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006079 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006080 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006081 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006082 end = s + size;
6083 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006084 unsigned char c;
6085 Py_UCS4 x;
6086 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006087 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006088
Benjamin Peterson29060642009-01-31 22:14:21 +00006089 /* Non-escape characters are interpreted as Unicode ordinals */
6090 if (*s != '\\') {
6091 *p++ = (unsigned char)*s++;
6092 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006093 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006094 startinpos = s-starts;
6095
6096 /* \u-escapes are only interpreted iff the number of leading
6097 backslashes if odd */
6098 bs = s;
6099 for (;s < end;) {
6100 if (*s != '\\')
6101 break;
6102 *p++ = (unsigned char)*s++;
6103 }
6104 if (((s - bs) & 1) == 0 ||
6105 s >= end ||
6106 (*s != 'u' && *s != 'U')) {
6107 continue;
6108 }
6109 p--;
6110 count = *s=='u' ? 4 : 8;
6111 s++;
6112
6113 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
6114 outpos = p-PyUnicode_AS_UNICODE(v);
6115 for (x = 0, i = 0; i < count; ++i, ++s) {
6116 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006117 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006118 endinpos = s-starts;
6119 if (unicode_decode_call_errorhandler(
6120 errors, &errorHandler,
6121 "rawunicodeescape", "truncated \\uXXXX",
6122 &starts, &end, &startinpos, &endinpos, &exc, &s,
6123 &v, &outpos, &p))
6124 goto onError;
6125 goto nextByte;
6126 }
6127 x = (x<<4) & ~0xF;
6128 if (c >= '0' && c <= '9')
6129 x += c - '0';
6130 else if (c >= 'a' && c <= 'f')
6131 x += 10 + c - 'a';
6132 else
6133 x += 10 + c - 'A';
6134 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00006135 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00006136 /* UCS-2 character */
6137 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006138 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006139 /* UCS-4 character. Either store directly, or as
6140 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00006141#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006142 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006143#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006144 x -= 0x10000L;
6145 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
6146 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00006147#endif
6148 } else {
6149 endinpos = s-starts;
6150 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006151 if (unicode_decode_call_errorhandler(
6152 errors, &errorHandler,
6153 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006154 &starts, &end, &startinpos, &endinpos, &exc, &s,
6155 &v, &outpos, &p))
6156 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006157 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006158 nextByte:
6159 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02006161 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006162 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006163 Py_XDECREF(errorHandler);
6164 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006165#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006166 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006167 Py_DECREF(v);
6168 return NULL;
6169 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006170#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006171 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006172 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006173
Benjamin Peterson29060642009-01-31 22:14:21 +00006174 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006175 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006176 Py_XDECREF(errorHandler);
6177 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006178 return NULL;
6179}
6180
Alexander Belopolsky40018472011-02-26 01:02:56 +00006181PyObject *
6182PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006183 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006185 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006186 char *p;
6187 char *q;
6188
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006189#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006190 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006191#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006192 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006193#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00006194
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006195 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006196 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006197
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006198 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006199 if (repr == NULL)
6200 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00006201 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006202 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006203
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006204 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205 while (size-- > 0) {
6206 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006207#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006208 /* Map 32-bit characters to '\Uxxxxxxxx' */
6209 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006210 *p++ = '\\';
6211 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00006212 *p++ = hexdigits[(ch >> 28) & 0xf];
6213 *p++ = hexdigits[(ch >> 24) & 0xf];
6214 *p++ = hexdigits[(ch >> 20) & 0xf];
6215 *p++ = hexdigits[(ch >> 16) & 0xf];
6216 *p++ = hexdigits[(ch >> 12) & 0xf];
6217 *p++ = hexdigits[(ch >> 8) & 0xf];
6218 *p++ = hexdigits[(ch >> 4) & 0xf];
6219 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006220 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006221 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00006222#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006223 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6224 if (ch >= 0xD800 && ch < 0xDC00) {
6225 Py_UNICODE ch2;
6226 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006227
Benjamin Peterson29060642009-01-31 22:14:21 +00006228 ch2 = *s++;
6229 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00006230 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006231 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6232 *p++ = '\\';
6233 *p++ = 'U';
6234 *p++ = hexdigits[(ucs >> 28) & 0xf];
6235 *p++ = hexdigits[(ucs >> 24) & 0xf];
6236 *p++ = hexdigits[(ucs >> 20) & 0xf];
6237 *p++ = hexdigits[(ucs >> 16) & 0xf];
6238 *p++ = hexdigits[(ucs >> 12) & 0xf];
6239 *p++ = hexdigits[(ucs >> 8) & 0xf];
6240 *p++ = hexdigits[(ucs >> 4) & 0xf];
6241 *p++ = hexdigits[ucs & 0xf];
6242 continue;
6243 }
6244 /* Fall through: isolated surrogates are copied as-is */
6245 s--;
6246 size++;
6247 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006248#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006249 /* Map 16-bit characters to '\uxxxx' */
6250 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006251 *p++ = '\\';
6252 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00006253 *p++ = hexdigits[(ch >> 12) & 0xf];
6254 *p++ = hexdigits[(ch >> 8) & 0xf];
6255 *p++ = hexdigits[(ch >> 4) & 0xf];
6256 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006257 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006258 /* Copy everything else as-is */
6259 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006260 *p++ = (char) ch;
6261 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006262 size = p - q;
6263
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006264 assert(size > 0);
6265 if (_PyBytes_Resize(&repr, size) < 0)
6266 return NULL;
6267 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006268}
6269
Alexander Belopolsky40018472011-02-26 01:02:56 +00006270PyObject *
6271PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006272{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006273 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006274 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00006275 PyErr_BadArgument();
6276 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006277 }
Walter Dörwald711005d2007-05-12 12:03:26 +00006278 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6279 PyUnicode_GET_SIZE(unicode));
6280
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006281 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006282}
6283
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006284/* --- Unicode Internal Codec ------------------------------------------- */
6285
Alexander Belopolsky40018472011-02-26 01:02:56 +00006286PyObject *
6287_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006288 Py_ssize_t size,
6289 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006290{
6291 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006292 Py_ssize_t startinpos;
6293 Py_ssize_t endinpos;
6294 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006295 PyUnicodeObject *v;
6296 Py_UNICODE *p;
6297 const char *end;
6298 const char *reason;
6299 PyObject *errorHandler = NULL;
6300 PyObject *exc = NULL;
6301
Neal Norwitzd43069c2006-01-08 01:12:10 +00006302#ifdef Py_UNICODE_WIDE
6303 Py_UNICODE unimax = PyUnicode_GetMax();
6304#endif
6305
Thomas Wouters89f507f2006-12-13 04:49:30 +00006306 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006307 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
6308 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006309 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006310 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
6311 as string was created with the old API. */
6312 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006313 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006314 p = PyUnicode_AS_UNICODE(v);
6315 end = s + size;
6316
6317 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006318 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006319 /* We have to sanity check the raw data, otherwise doom looms for
6320 some malformed UCS-4 data. */
6321 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006322#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006323 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006324#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006325 end-s < Py_UNICODE_SIZE
6326 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006327 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006328 startinpos = s - starts;
6329 if (end-s < Py_UNICODE_SIZE) {
6330 endinpos = end-starts;
6331 reason = "truncated input";
6332 }
6333 else {
6334 endinpos = s - starts + Py_UNICODE_SIZE;
6335 reason = "illegal code point (> 0x10FFFF)";
6336 }
6337 outpos = p - PyUnicode_AS_UNICODE(v);
6338 if (unicode_decode_call_errorhandler(
6339 errors, &errorHandler,
6340 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006341 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00006342 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006343 goto onError;
6344 }
6345 }
6346 else {
6347 p++;
6348 s += Py_UNICODE_SIZE;
6349 }
6350 }
6351
Victor Stinnerfe226c02011-10-03 03:52:20 +02006352 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006353 goto onError;
6354 Py_XDECREF(errorHandler);
6355 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006356#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006357 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006358 Py_DECREF(v);
6359 return NULL;
6360 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006361#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006362 assert(_PyUnicode_CheckConsistency(v, 1));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006363 return (PyObject *)v;
6364
Benjamin Peterson29060642009-01-31 22:14:21 +00006365 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006366 Py_XDECREF(v);
6367 Py_XDECREF(errorHandler);
6368 Py_XDECREF(exc);
6369 return NULL;
6370}
6371
Guido van Rossumd57fd912000-03-10 22:53:23 +00006372/* --- Latin-1 Codec ------------------------------------------------------ */
6373
Alexander Belopolsky40018472011-02-26 01:02:56 +00006374PyObject *
6375PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006376 Py_ssize_t size,
6377 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006378{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006379 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006380 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006381}
6382
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006383/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006384static void
6385make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006386 const char *encoding,
6387 const Py_UNICODE *unicode, Py_ssize_t size,
6388 Py_ssize_t startpos, Py_ssize_t endpos,
6389 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006390{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006391 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006392 *exceptionObject = PyUnicodeEncodeError_Create(
6393 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006394 }
6395 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006396 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6397 goto onError;
6398 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6399 goto onError;
6400 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6401 goto onError;
6402 return;
6403 onError:
6404 Py_DECREF(*exceptionObject);
6405 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006406 }
6407}
6408
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006409/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006410static void
6411raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006412 const char *encoding,
6413 const Py_UNICODE *unicode, Py_ssize_t size,
6414 Py_ssize_t startpos, Py_ssize_t endpos,
6415 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006416{
6417 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006418 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006419 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006420 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006421}
6422
6423/* error handling callback helper:
6424 build arguments, call the callback and check the arguments,
6425 put the result into newpos and return the replacement string, which
6426 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006427static PyObject *
6428unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006429 PyObject **errorHandler,
6430 const char *encoding, const char *reason,
6431 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6432 Py_ssize_t startpos, Py_ssize_t endpos,
6433 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006434{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006435 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006436
6437 PyObject *restuple;
6438 PyObject *resunicode;
6439
6440 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006441 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006442 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006443 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006444 }
6445
6446 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006447 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006448 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006449 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006450
6451 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006452 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006453 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006454 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006455 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006456 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006457 Py_DECREF(restuple);
6458 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006459 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006460 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006461 &resunicode, newpos)) {
6462 Py_DECREF(restuple);
6463 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006464 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006465 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6466 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6467 Py_DECREF(restuple);
6468 return NULL;
6469 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006470 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006471 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006472 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006473 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6474 Py_DECREF(restuple);
6475 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006476 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006477 Py_INCREF(resunicode);
6478 Py_DECREF(restuple);
6479 return resunicode;
6480}
6481
Alexander Belopolsky40018472011-02-26 01:02:56 +00006482static PyObject *
6483unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006484 Py_ssize_t size,
6485 const char *errors,
6486 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006487{
6488 /* output object */
6489 PyObject *res;
6490 /* pointers to the beginning and end+1 of input */
6491 const Py_UNICODE *startp = p;
6492 const Py_UNICODE *endp = p + size;
6493 /* pointer to the beginning of the unencodable characters */
6494 /* const Py_UNICODE *badp = NULL; */
6495 /* pointer into the output */
6496 char *str;
6497 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006498 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006499 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6500 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006501 PyObject *errorHandler = NULL;
6502 PyObject *exc = NULL;
6503 /* the following variable is used for caching string comparisons
6504 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6505 int known_errorHandler = -1;
6506
6507 /* allocate enough for a simple encoding without
6508 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006509 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006510 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006511 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006512 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006513 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006514 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006515 ressize = size;
6516
6517 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006518 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006519
Benjamin Peterson29060642009-01-31 22:14:21 +00006520 /* can we encode this? */
6521 if (c<limit) {
6522 /* no overflow check, because we know that the space is enough */
6523 *str++ = (char)c;
6524 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006525 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006526 else {
6527 Py_ssize_t unicodepos = p-startp;
6528 Py_ssize_t requiredsize;
6529 PyObject *repunicode;
6530 Py_ssize_t repsize;
6531 Py_ssize_t newpos;
6532 Py_ssize_t respos;
6533 Py_UNICODE *uni2;
6534 /* startpos for collecting unencodable chars */
6535 const Py_UNICODE *collstart = p;
6536 const Py_UNICODE *collend = p;
6537 /* find all unecodable characters */
6538 while ((collend < endp) && ((*collend)>=limit))
6539 ++collend;
6540 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6541 if (known_errorHandler==-1) {
6542 if ((errors==NULL) || (!strcmp(errors, "strict")))
6543 known_errorHandler = 1;
6544 else if (!strcmp(errors, "replace"))
6545 known_errorHandler = 2;
6546 else if (!strcmp(errors, "ignore"))
6547 known_errorHandler = 3;
6548 else if (!strcmp(errors, "xmlcharrefreplace"))
6549 known_errorHandler = 4;
6550 else
6551 known_errorHandler = 0;
6552 }
6553 switch (known_errorHandler) {
6554 case 1: /* strict */
6555 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6556 goto onError;
6557 case 2: /* replace */
6558 while (collstart++<collend)
6559 *str++ = '?'; /* fall through */
6560 case 3: /* ignore */
6561 p = collend;
6562 break;
6563 case 4: /* xmlcharrefreplace */
6564 respos = str - PyBytes_AS_STRING(res);
6565 /* determine replacement size (temporarily (mis)uses p) */
6566 for (p = collstart, repsize = 0; p < collend; ++p) {
6567 if (*p<10)
6568 repsize += 2+1+1;
6569 else if (*p<100)
6570 repsize += 2+2+1;
6571 else if (*p<1000)
6572 repsize += 2+3+1;
6573 else if (*p<10000)
6574 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006575#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006576 else
6577 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006578#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006579 else if (*p<100000)
6580 repsize += 2+5+1;
6581 else if (*p<1000000)
6582 repsize += 2+6+1;
6583 else
6584 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006585#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006586 }
6587 requiredsize = respos+repsize+(endp-collend);
6588 if (requiredsize > ressize) {
6589 if (requiredsize<2*ressize)
6590 requiredsize = 2*ressize;
6591 if (_PyBytes_Resize(&res, requiredsize))
6592 goto onError;
6593 str = PyBytes_AS_STRING(res) + respos;
6594 ressize = requiredsize;
6595 }
6596 /* generate replacement (temporarily (mis)uses p) */
6597 for (p = collstart; p < collend; ++p) {
6598 str += sprintf(str, "&#%d;", (int)*p);
6599 }
6600 p = collend;
6601 break;
6602 default:
6603 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6604 encoding, reason, startp, size, &exc,
6605 collstart-startp, collend-startp, &newpos);
6606 if (repunicode == NULL)
6607 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006608 if (PyBytes_Check(repunicode)) {
6609 /* Directly copy bytes result to output. */
6610 repsize = PyBytes_Size(repunicode);
6611 if (repsize > 1) {
6612 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006613 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006614 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6615 Py_DECREF(repunicode);
6616 goto onError;
6617 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006618 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006619 ressize += repsize-1;
6620 }
6621 memcpy(str, PyBytes_AsString(repunicode), repsize);
6622 str += repsize;
6623 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006624 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006625 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006626 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006627 /* need more space? (at least enough for what we
6628 have+the replacement+the rest of the string, so
6629 we won't have to check space for encodable characters) */
6630 respos = str - PyBytes_AS_STRING(res);
6631 repsize = PyUnicode_GET_SIZE(repunicode);
6632 requiredsize = respos+repsize+(endp-collend);
6633 if (requiredsize > ressize) {
6634 if (requiredsize<2*ressize)
6635 requiredsize = 2*ressize;
6636 if (_PyBytes_Resize(&res, requiredsize)) {
6637 Py_DECREF(repunicode);
6638 goto onError;
6639 }
6640 str = PyBytes_AS_STRING(res) + respos;
6641 ressize = requiredsize;
6642 }
6643 /* check if there is anything unencodable in the replacement
6644 and copy it to the output */
6645 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6646 c = *uni2;
6647 if (c >= limit) {
6648 raise_encode_exception(&exc, encoding, startp, size,
6649 unicodepos, unicodepos+1, reason);
6650 Py_DECREF(repunicode);
6651 goto onError;
6652 }
6653 *str = (char)c;
6654 }
6655 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006656 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006657 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006658 }
6659 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006660 /* Resize if we allocated to much */
6661 size = str - PyBytes_AS_STRING(res);
6662 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006663 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006664 if (_PyBytes_Resize(&res, size) < 0)
6665 goto onError;
6666 }
6667
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006668 Py_XDECREF(errorHandler);
6669 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006670 return res;
6671
6672 onError:
6673 Py_XDECREF(res);
6674 Py_XDECREF(errorHandler);
6675 Py_XDECREF(exc);
6676 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006677}
6678
Alexander Belopolsky40018472011-02-26 01:02:56 +00006679PyObject *
6680PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006681 Py_ssize_t size,
6682 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006684 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006685}
6686
Alexander Belopolsky40018472011-02-26 01:02:56 +00006687PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006688_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006689{
6690 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006691 PyErr_BadArgument();
6692 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006693 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006694 if (PyUnicode_READY(unicode) == -1)
6695 return NULL;
6696 /* Fast path: if it is a one-byte string, construct
6697 bytes object directly. */
6698 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6699 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6700 PyUnicode_GET_LENGTH(unicode));
6701 /* Non-Latin-1 characters present. Defer to above function to
6702 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006703 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006704 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006705 errors);
6706}
6707
6708PyObject*
6709PyUnicode_AsLatin1String(PyObject *unicode)
6710{
6711 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006712}
6713
6714/* --- 7-bit ASCII Codec -------------------------------------------------- */
6715
Alexander Belopolsky40018472011-02-26 01:02:56 +00006716PyObject *
6717PyUnicode_DecodeASCII(const char *s,
6718 Py_ssize_t size,
6719 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006720{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006721 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006722 PyUnicodeObject *v;
Victor Stinner702c7342011-10-05 13:50:52 +02006723 Py_UNICODE *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006724 Py_ssize_t startinpos;
6725 Py_ssize_t endinpos;
6726 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006727 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006728 int has_error;
6729 const unsigned char *p = (const unsigned char *)s;
6730 const unsigned char *end = p + size;
6731 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006732 PyObject *errorHandler = NULL;
6733 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006734
Guido van Rossumd57fd912000-03-10 22:53:23 +00006735 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006736 if (size == 1 && (unsigned char)s[0] < 128)
6737 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006738
Victor Stinner702c7342011-10-05 13:50:52 +02006739 has_error = 0;
6740 while (p < end && !has_error) {
6741 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6742 an explanation. */
6743 if (!((size_t) p & LONG_PTR_MASK)) {
6744 /* Help register allocation */
6745 register const unsigned char *_p = p;
6746 while (_p < aligned_end) {
6747 unsigned long value = *(unsigned long *) _p;
6748 if (value & ASCII_CHAR_MASK) {
6749 has_error = 1;
6750 break;
6751 }
6752 _p += SIZEOF_LONG;
6753 }
6754 if (_p == end)
6755 break;
6756 if (has_error)
6757 break;
6758 p = _p;
6759 }
6760 if (*p & 0x80) {
6761 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006762 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006763 }
6764 else {
6765 ++p;
6766 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006767 }
Victor Stinner702c7342011-10-05 13:50:52 +02006768 if (!has_error)
6769 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006770
Guido van Rossumd57fd912000-03-10 22:53:23 +00006771 v = _PyUnicode_New(size);
6772 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006773 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006774 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006775 return (PyObject *)v;
Victor Stinner702c7342011-10-05 13:50:52 +02006776 u = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006777 e = s + size;
6778 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006779 register unsigned char c = (unsigned char)*s;
6780 if (c < 128) {
Victor Stinner702c7342011-10-05 13:50:52 +02006781 *u++ = c;
Benjamin Peterson29060642009-01-31 22:14:21 +00006782 ++s;
6783 }
6784 else {
6785 startinpos = s-starts;
6786 endinpos = startinpos + 1;
Victor Stinner702c7342011-10-05 13:50:52 +02006787 outpos = u - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006788 if (unicode_decode_call_errorhandler(
6789 errors, &errorHandler,
6790 "ascii", "ordinal not in range(128)",
6791 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinner702c7342011-10-05 13:50:52 +02006792 &v, &outpos, &u))
Benjamin Peterson29060642009-01-31 22:14:21 +00006793 goto onError;
6794 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006795 }
Victor Stinner702c7342011-10-05 13:50:52 +02006796 if (u - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
6797 if (PyUnicode_Resize((PyObject**)&v, u - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006798 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006799 Py_XDECREF(errorHandler);
6800 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006801#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006802 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006803 Py_DECREF(v);
6804 return NULL;
6805 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006806#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006807 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006808 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006809
Benjamin Peterson29060642009-01-31 22:14:21 +00006810 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006811 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006812 Py_XDECREF(errorHandler);
6813 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006814 return NULL;
6815}
6816
Alexander Belopolsky40018472011-02-26 01:02:56 +00006817PyObject *
6818PyUnicode_EncodeASCII(const Py_UNICODE *p,
6819 Py_ssize_t size,
6820 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006822 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006823}
6824
Alexander Belopolsky40018472011-02-26 01:02:56 +00006825PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006826_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006827{
6828 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006829 PyErr_BadArgument();
6830 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006831 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006832 if (PyUnicode_READY(unicode) == -1)
6833 return NULL;
6834 /* Fast path: if it is an ASCII-only string, construct bytes object
6835 directly. Else defer to above function to raise the exception. */
6836 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6837 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6838 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006839 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006840 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006841 errors);
6842}
6843
6844PyObject *
6845PyUnicode_AsASCIIString(PyObject *unicode)
6846{
6847 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006848}
6849
Victor Stinner99b95382011-07-04 14:23:54 +02006850#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006851
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006852/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006853
/* decode_mbcs() and encode_mbcs() take their sizes as int.  When size_t
   is wider than int, the public MBCS entry points test NEED_RETRY to feed
   inputs longer than INT_MAX in several passes. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#define NEED_RETRY
#endif
6857
6858/* XXX This code is limited to "true" double-byte encodings, as
6859 a) it assumes an incomplete character consists of a single byte, and
6860 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006861 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006862
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead byte,
 * 0 otherwise.
 *
 * The byte must satisfy IsDBCSLeadByte().  It is then accepted when
 * CharPrev(s, pos) returns pos itself, when the character found by
 * CharPrev() does not begin with a lead byte, or when that character
 * begins exactly two bytes before pos.
 * NOTE(review): presumably these checks rule out a trail byte whose value
 * also happens to be a valid lead byte -- confirm against the Win32 docs.
 */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *pos = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*pos))
        return 0;

    before = CharPrev(s, pos);
    if (before == pos)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return pos - before == 2;
}
6874
6875/*
6876 * Decode MBCS string into unicode object. If 'final' is set, converts
6877 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6878 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006879static int
6880decode_mbcs(PyUnicodeObject **v,
6881 const char *s, /* MBCS string */
6882 int size, /* sizeof MBCS string */
6883 int final,
6884 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006885{
6886 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006887 Py_ssize_t n;
6888 DWORD usize;
6889 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006890
6891 assert(size >= 0);
6892
Victor Stinner554f3f02010-06-16 23:33:54 +00006893 /* check and handle 'errors' arg */
6894 if (errors==NULL || strcmp(errors, "strict")==0)
6895 flags = MB_ERR_INVALID_CHARS;
6896 else if (strcmp(errors, "ignore")==0)
6897 flags = 0;
6898 else {
6899 PyErr_Format(PyExc_ValueError,
6900 "mbcs encoding does not support errors='%s'",
6901 errors);
6902 return -1;
6903 }
6904
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006905 /* Skip trailing lead-byte unless 'final' is set */
6906 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006907 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006908
6909 /* First get the size of the result */
6910 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006911 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6912 if (usize==0)
6913 goto mbcs_decode_error;
6914 } else
6915 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006916
6917 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006918 /* Create unicode object */
6919 *v = _PyUnicode_New(usize);
6920 if (*v == NULL)
6921 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006922 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006923 }
6924 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006925 /* Extend unicode object */
6926 n = PyUnicode_GET_SIZE(*v);
Victor Stinner2fd82272011-10-03 04:06:05 +02006927 if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006928 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006929 }
6930
6931 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006932 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006933 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006934 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6935 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006936 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006937 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006938 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006939
6940mbcs_decode_error:
6941 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6942 we raise a UnicodeDecodeError - else it is a 'generic'
6943 windows error
6944 */
6945 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6946 /* Ideally, we should get reason from FormatMessage - this
6947 is the Windows 2000 English version of the message
6948 */
6949 PyObject *exc = NULL;
6950 const char *reason = "No mapping for the Unicode character exists "
6951 "in the target multi-byte code page.";
6952 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6953 if (exc != NULL) {
6954 PyCodec_StrictErrors(exc);
6955 Py_DECREF(exc);
6956 }
6957 } else {
6958 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6959 }
6960 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006961}
6962
Alexander Belopolsky40018472011-02-26 01:02:56 +00006963PyObject *
6964PyUnicode_DecodeMBCSStateful(const char *s,
6965 Py_ssize_t size,
6966 const char *errors,
6967 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006968{
6969 PyUnicodeObject *v = NULL;
6970 int done;
6971
6972 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006973 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006974
6975#ifdef NEED_RETRY
6976 retry:
6977 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006978 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006979 else
6980#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006981 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006982
6983 if (done < 0) {
6984 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006985 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006986 }
6987
6988 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006989 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006990
6991#ifdef NEED_RETRY
6992 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006993 s += done;
6994 size -= done;
6995 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006996 }
6997#endif
Victor Stinner17efeed2011-10-04 20:05:46 +02006998#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006999 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007000 Py_DECREF(v);
7001 return NULL;
7002 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007003#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007004 assert(_PyUnicode_CheckConsistency(v, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007005 return (PyObject *)v;
7006}
7007
Alexander Belopolsky40018472011-02-26 01:02:56 +00007008PyObject *
7009PyUnicode_DecodeMBCS(const char *s,
7010 Py_ssize_t size,
7011 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007012{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007013 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7014}
7015
7016/*
7017 * Convert unicode into string object (MBCS).
7018 * Returns 0 if succeed, -1 otherwise.
7019 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007020static int
7021encode_mbcs(PyObject **repr,
7022 const Py_UNICODE *p, /* unicode */
7023 int size, /* size of unicode */
7024 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007025{
Victor Stinner554f3f02010-06-16 23:33:54 +00007026 BOOL usedDefaultChar = FALSE;
7027 BOOL *pusedDefaultChar;
7028 int mbcssize;
7029 Py_ssize_t n;
7030 PyObject *exc = NULL;
7031 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007032
7033 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007034
Victor Stinner554f3f02010-06-16 23:33:54 +00007035 /* check and handle 'errors' arg */
7036 if (errors==NULL || strcmp(errors, "strict")==0) {
7037 flags = WC_NO_BEST_FIT_CHARS;
7038 pusedDefaultChar = &usedDefaultChar;
7039 } else if (strcmp(errors, "replace")==0) {
7040 flags = 0;
7041 pusedDefaultChar = NULL;
7042 } else {
7043 PyErr_Format(PyExc_ValueError,
7044 "mbcs encoding does not support errors='%s'",
7045 errors);
7046 return -1;
7047 }
7048
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007049 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007050 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00007051 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
7052 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00007053 if (mbcssize == 0) {
7054 PyErr_SetFromWindowsErrWithFilename(0, NULL);
7055 return -1;
7056 }
Victor Stinner554f3f02010-06-16 23:33:54 +00007057 /* If we used a default char, then we failed! */
7058 if (pusedDefaultChar && *pusedDefaultChar)
7059 goto mbcs_encode_error;
7060 } else {
7061 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007062 }
7063
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007064 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007065 /* Create string object */
7066 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
7067 if (*repr == NULL)
7068 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00007069 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007070 }
7071 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007072 /* Extend string object */
7073 n = PyBytes_Size(*repr);
7074 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
7075 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007076 }
7077
7078 /* Do the conversion */
7079 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007080 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00007081 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
7082 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007083 PyErr_SetFromWindowsErrWithFilename(0, NULL);
7084 return -1;
7085 }
Victor Stinner554f3f02010-06-16 23:33:54 +00007086 if (pusedDefaultChar && *pusedDefaultChar)
7087 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007088 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007089 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007090
7091mbcs_encode_error:
7092 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
7093 Py_XDECREF(exc);
7094 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007095}
7096
Alexander Belopolsky40018472011-02-26 01:02:56 +00007097PyObject *
7098PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7099 Py_ssize_t size,
7100 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007101{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007102 PyObject *repr = NULL;
7103 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00007104
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007105#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00007106 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007107 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00007108 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007109 else
7110#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00007111 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007112
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007113 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007114 Py_XDECREF(repr);
7115 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007116 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007117
7118#ifdef NEED_RETRY
7119 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007120 p += INT_MAX;
7121 size -= INT_MAX;
7122 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007123 }
7124#endif
7125
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007126 return repr;
7127}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007128
Alexander Belopolsky40018472011-02-26 01:02:56 +00007129PyObject *
7130PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007131{
7132 if (!PyUnicode_Check(unicode)) {
7133 PyErr_BadArgument();
7134 return NULL;
7135 }
7136 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007137 PyUnicode_GET_SIZE(unicode),
7138 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007139}
7140
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007141#undef NEED_RETRY
7142
Victor Stinner99b95382011-07-04 14:23:54 +02007143#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007144
Guido van Rossumd57fd912000-03-10 22:53:23 +00007145/* --- Character Mapping Codec -------------------------------------------- */
7146
Alexander Belopolsky40018472011-02-26 01:02:56 +00007147PyObject *
7148PyUnicode_DecodeCharmap(const char *s,
7149 Py_ssize_t size,
7150 PyObject *mapping,
7151 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007152{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007153 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007154 Py_ssize_t startinpos;
7155 Py_ssize_t endinpos;
7156 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007157 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007158 PyUnicodeObject *v;
7159 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007160 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007161 PyObject *errorHandler = NULL;
7162 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007163 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007164 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00007165
Guido van Rossumd57fd912000-03-10 22:53:23 +00007166 /* Default to Latin-1 */
7167 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007168 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007169
7170 v = _PyUnicode_New(size);
7171 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007172 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007173 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007174 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007175 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007176 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007177 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007178 mapstring = PyUnicode_AS_UNICODE(mapping);
7179 maplen = PyUnicode_GET_SIZE(mapping);
7180 while (s < e) {
7181 unsigned char ch = *s;
7182 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007183
Benjamin Peterson29060642009-01-31 22:14:21 +00007184 if (ch < maplen)
7185 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00007186
Benjamin Peterson29060642009-01-31 22:14:21 +00007187 if (x == 0xfffe) {
7188 /* undefined mapping */
7189 outpos = p-PyUnicode_AS_UNICODE(v);
7190 startinpos = s-starts;
7191 endinpos = startinpos+1;
7192 if (unicode_decode_call_errorhandler(
7193 errors, &errorHandler,
7194 "charmap", "character maps to <undefined>",
7195 &starts, &e, &startinpos, &endinpos, &exc, &s,
7196 &v, &outpos, &p)) {
7197 goto onError;
7198 }
7199 continue;
7200 }
7201 *p++ = x;
7202 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007203 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007204 }
7205 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007206 while (s < e) {
7207 unsigned char ch = *s;
7208 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007209
Benjamin Peterson29060642009-01-31 22:14:21 +00007210 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7211 w = PyLong_FromLong((long)ch);
7212 if (w == NULL)
7213 goto onError;
7214 x = PyObject_GetItem(mapping, w);
7215 Py_DECREF(w);
7216 if (x == NULL) {
7217 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7218 /* No mapping found means: mapping is undefined. */
7219 PyErr_Clear();
7220 x = Py_None;
7221 Py_INCREF(x);
7222 } else
7223 goto onError;
7224 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007225
Benjamin Peterson29060642009-01-31 22:14:21 +00007226 /* Apply mapping */
7227 if (PyLong_Check(x)) {
7228 long value = PyLong_AS_LONG(x);
7229 if (value < 0 || value > 65535) {
7230 PyErr_SetString(PyExc_TypeError,
7231 "character mapping must be in range(65536)");
7232 Py_DECREF(x);
7233 goto onError;
7234 }
7235 *p++ = (Py_UNICODE)value;
7236 }
7237 else if (x == Py_None) {
7238 /* undefined mapping */
7239 outpos = p-PyUnicode_AS_UNICODE(v);
7240 startinpos = s-starts;
7241 endinpos = startinpos+1;
7242 if (unicode_decode_call_errorhandler(
7243 errors, &errorHandler,
7244 "charmap", "character maps to <undefined>",
7245 &starts, &e, &startinpos, &endinpos, &exc, &s,
7246 &v, &outpos, &p)) {
7247 Py_DECREF(x);
7248 goto onError;
7249 }
7250 Py_DECREF(x);
7251 continue;
7252 }
7253 else if (PyUnicode_Check(x)) {
7254 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007255
Benjamin Peterson29060642009-01-31 22:14:21 +00007256 if (targetsize == 1)
7257 /* 1-1 mapping */
7258 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007259
Benjamin Peterson29060642009-01-31 22:14:21 +00007260 else if (targetsize > 1) {
7261 /* 1-n mapping */
7262 if (targetsize > extrachars) {
7263 /* resize first */
7264 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
7265 Py_ssize_t needed = (targetsize - extrachars) + \
7266 (targetsize << 2);
7267 extrachars += needed;
7268 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02007269 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00007270 PyUnicode_GET_SIZE(v) + needed) < 0) {
7271 Py_DECREF(x);
7272 goto onError;
7273 }
7274 p = PyUnicode_AS_UNICODE(v) + oldpos;
7275 }
7276 Py_UNICODE_COPY(p,
7277 PyUnicode_AS_UNICODE(x),
7278 targetsize);
7279 p += targetsize;
7280 extrachars -= targetsize;
7281 }
7282 /* 1-0 mapping: skip the character */
7283 }
7284 else {
7285 /* wrong return value */
7286 PyErr_SetString(PyExc_TypeError,
7287 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007288 Py_DECREF(x);
7289 goto onError;
7290 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007291 Py_DECREF(x);
7292 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007293 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007294 }
7295 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02007296 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007297 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007298 Py_XDECREF(errorHandler);
7299 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02007300#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007301 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007302 Py_DECREF(v);
7303 return NULL;
7304 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007305#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007306 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007307 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00007308
Benjamin Peterson29060642009-01-31 22:14:21 +00007309 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007310 Py_XDECREF(errorHandler);
7311 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007312 Py_XDECREF(v);
7313 return NULL;
7314}
7315
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007316/* Charmap encoding: the lookup table */
7317
Alexander Belopolsky40018472011-02-26 01:02:56 +00007318struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007319 PyObject_HEAD
7320 unsigned char level1[32];
7321 int count2, count3;
7322 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007323};
7324
7325static PyObject*
7326encoding_map_size(PyObject *obj, PyObject* args)
7327{
7328 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007329 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007330 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007331}
7332
7333static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007334 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007335 PyDoc_STR("Return the size (in bytes) of this object") },
7336 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007337};
7338
7339static void
7340encoding_map_dealloc(PyObject* o)
7341{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007342 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007343}
7344
7345static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007346 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007347 "EncodingMap", /*tp_name*/
7348 sizeof(struct encoding_map), /*tp_basicsize*/
7349 0, /*tp_itemsize*/
7350 /* methods */
7351 encoding_map_dealloc, /*tp_dealloc*/
7352 0, /*tp_print*/
7353 0, /*tp_getattr*/
7354 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007355 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007356 0, /*tp_repr*/
7357 0, /*tp_as_number*/
7358 0, /*tp_as_sequence*/
7359 0, /*tp_as_mapping*/
7360 0, /*tp_hash*/
7361 0, /*tp_call*/
7362 0, /*tp_str*/
7363 0, /*tp_getattro*/
7364 0, /*tp_setattro*/
7365 0, /*tp_as_buffer*/
7366 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7367 0, /*tp_doc*/
7368 0, /*tp_traverse*/
7369 0, /*tp_clear*/
7370 0, /*tp_richcompare*/
7371 0, /*tp_weaklistoffset*/
7372 0, /*tp_iter*/
7373 0, /*tp_iternext*/
7374 encoding_map_methods, /*tp_methods*/
7375 0, /*tp_members*/
7376 0, /*tp_getset*/
7377 0, /*tp_base*/
7378 0, /*tp_dict*/
7379 0, /*tp_descr_get*/
7380 0, /*tp_descr_set*/
7381 0, /*tp_dictoffset*/
7382 0, /*tp_init*/
7383 0, /*tp_alloc*/
7384 0, /*tp_new*/
7385 0, /*tp_free*/
7386 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007387};
7388
7389PyObject*
7390PyUnicode_BuildEncodingMap(PyObject* string)
7391{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007392 PyObject *result;
7393 struct encoding_map *mresult;
7394 int i;
7395 int need_dict = 0;
7396 unsigned char level1[32];
7397 unsigned char level2[512];
7398 unsigned char *mlevel1, *mlevel2, *mlevel3;
7399 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007400 int kind;
7401 void *data;
7402 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007403
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007404 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007405 PyErr_BadArgument();
7406 return NULL;
7407 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007408 kind = PyUnicode_KIND(string);
7409 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007410 memset(level1, 0xFF, sizeof level1);
7411 memset(level2, 0xFF, sizeof level2);
7412
7413 /* If there isn't a one-to-one mapping of NULL to \0,
7414 or if there are non-BMP characters, we need to use
7415 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007416 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007417 need_dict = 1;
7418 for (i = 1; i < 256; i++) {
7419 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007420 ch = PyUnicode_READ(kind, data, i);
7421 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007422 need_dict = 1;
7423 break;
7424 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007425 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007426 /* unmapped character */
7427 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007428 l1 = ch >> 11;
7429 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007430 if (level1[l1] == 0xFF)
7431 level1[l1] = count2++;
7432 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007433 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007434 }
7435
7436 if (count2 >= 0xFF || count3 >= 0xFF)
7437 need_dict = 1;
7438
7439 if (need_dict) {
7440 PyObject *result = PyDict_New();
7441 PyObject *key, *value;
7442 if (!result)
7443 return NULL;
7444 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007445 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007446 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007447 if (!key || !value)
7448 goto failed1;
7449 if (PyDict_SetItem(result, key, value) == -1)
7450 goto failed1;
7451 Py_DECREF(key);
7452 Py_DECREF(value);
7453 }
7454 return result;
7455 failed1:
7456 Py_XDECREF(key);
7457 Py_XDECREF(value);
7458 Py_DECREF(result);
7459 return NULL;
7460 }
7461
7462 /* Create a three-level trie */
7463 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7464 16*count2 + 128*count3 - 1);
7465 if (!result)
7466 return PyErr_NoMemory();
7467 PyObject_Init(result, &EncodingMapType);
7468 mresult = (struct encoding_map*)result;
7469 mresult->count2 = count2;
7470 mresult->count3 = count3;
7471 mlevel1 = mresult->level1;
7472 mlevel2 = mresult->level23;
7473 mlevel3 = mresult->level23 + 16*count2;
7474 memcpy(mlevel1, level1, 32);
7475 memset(mlevel2, 0xFF, 16*count2);
7476 memset(mlevel3, 0, 128*count3);
7477 count3 = 0;
7478 for (i = 1; i < 256; i++) {
7479 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007480 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007481 /* unmapped character */
7482 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007483 o1 = PyUnicode_READ(kind, data, i)>>11;
7484 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007485 i2 = 16*mlevel1[o1] + o2;
7486 if (mlevel2[i2] == 0xFF)
7487 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007488 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007489 i3 = 128*mlevel2[i2] + o3;
7490 mlevel3[i3] = i;
7491 }
7492 return result;
7493}
7494
7495static int
7496encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7497{
7498 struct encoding_map *map = (struct encoding_map*)mapping;
7499 int l1 = c>>11;
7500 int l2 = (c>>7) & 0xF;
7501 int l3 = c & 0x7F;
7502 int i;
7503
7504#ifdef Py_UNICODE_WIDE
7505 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007506 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007507 }
7508#endif
7509 if (c == 0)
7510 return 0;
7511 /* level 1*/
7512 i = map->level1[l1];
7513 if (i == 0xFF) {
7514 return -1;
7515 }
7516 /* level 2*/
7517 i = map->level23[16*i+l2];
7518 if (i == 0xFF) {
7519 return -1;
7520 }
7521 /* level 3 */
7522 i = map->level23[16*map->count2 + 128*i + l3];
7523 if (i == 0) {
7524 return -1;
7525 }
7526 return i;
7527}
7528
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007529/* Lookup the character ch in the mapping. If the character
7530 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007531 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007532static PyObject *
7533charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007534{
Christian Heimes217cfd12007-12-02 14:31:20 +00007535 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007536 PyObject *x;
7537
7538 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007539 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007540 x = PyObject_GetItem(mapping, w);
7541 Py_DECREF(w);
7542 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007543 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7544 /* No mapping found means: mapping is undefined. */
7545 PyErr_Clear();
7546 x = Py_None;
7547 Py_INCREF(x);
7548 return x;
7549 } else
7550 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007551 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007552 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007553 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007554 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007555 long value = PyLong_AS_LONG(x);
7556 if (value < 0 || value > 255) {
7557 PyErr_SetString(PyExc_TypeError,
7558 "character mapping must be in range(256)");
7559 Py_DECREF(x);
7560 return NULL;
7561 }
7562 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007563 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007564 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007565 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007566 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007567 /* wrong return value */
7568 PyErr_Format(PyExc_TypeError,
7569 "character mapping must return integer, bytes or None, not %.400s",
7570 x->ob_type->tp_name);
7571 Py_DECREF(x);
7572 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007573 }
7574}
7575
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007576static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007577charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007578{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007579 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7580 /* exponentially overallocate to minimize reallocations */
7581 if (requiredsize < 2*outsize)
7582 requiredsize = 2*outsize;
7583 if (_PyBytes_Resize(outobj, requiredsize))
7584 return -1;
7585 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007586}
7587
/* Outcome of trying to encode a single character with a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was encoded and written */
    enc_FAILED,         /* the character has no mapping; nothing written */
    enc_EXCEPTION       /* an error occurred (lookup or resize failed) */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007591/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007592 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007593 space is available. Return a new reference to the object that
7594 was put in the output buffer, or Py_None, if the mapping was undefined
7595 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007596 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007597static charmapencode_result
7598charmapencode_output(Py_UNICODE c, PyObject *mapping,
7599 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007600{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007601 PyObject *rep;
7602 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007603 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007604
Christian Heimes90aa7642007-12-19 02:45:37 +00007605 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007606 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007607 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007608 if (res == -1)
7609 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007610 if (outsize<requiredsize)
7611 if (charmapencode_resize(outobj, outpos, requiredsize))
7612 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007613 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007614 outstart[(*outpos)++] = (char)res;
7615 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007616 }
7617
7618 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007619 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007620 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007621 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007622 Py_DECREF(rep);
7623 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007624 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007625 if (PyLong_Check(rep)) {
7626 Py_ssize_t requiredsize = *outpos+1;
7627 if (outsize<requiredsize)
7628 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7629 Py_DECREF(rep);
7630 return enc_EXCEPTION;
7631 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007632 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007633 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007634 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007635 else {
7636 const char *repchars = PyBytes_AS_STRING(rep);
7637 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7638 Py_ssize_t requiredsize = *outpos+repsize;
7639 if (outsize<requiredsize)
7640 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7641 Py_DECREF(rep);
7642 return enc_EXCEPTION;
7643 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007644 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007645 memcpy(outstart + *outpos, repchars, repsize);
7646 *outpos += repsize;
7647 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007648 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007649 Py_DECREF(rep);
7650 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007651}
7652
7653/* handle an error in PyUnicode_EncodeCharmap
7654 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007655static int
7656charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00007657 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007658 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007659 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007660 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007661{
7662 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007663 Py_ssize_t repsize;
7664 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007665 Py_UNICODE *uni2;
7666 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007667 Py_ssize_t collstartpos = *inpos;
7668 Py_ssize_t collendpos = *inpos+1;
7669 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007670 char *encoding = "charmap";
7671 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007672 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007673
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007674 /* find all unencodable characters */
7675 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007676 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007677 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007678 int res = encoding_map_lookup(p[collendpos], mapping);
7679 if (res != -1)
7680 break;
7681 ++collendpos;
7682 continue;
7683 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007684
Benjamin Peterson29060642009-01-31 22:14:21 +00007685 rep = charmapencode_lookup(p[collendpos], mapping);
7686 if (rep==NULL)
7687 return -1;
7688 else if (rep!=Py_None) {
7689 Py_DECREF(rep);
7690 break;
7691 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007692 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007693 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007694 }
7695 /* cache callback name lookup
7696 * (if not done yet, i.e. it's the first error) */
7697 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007698 if ((errors==NULL) || (!strcmp(errors, "strict")))
7699 *known_errorHandler = 1;
7700 else if (!strcmp(errors, "replace"))
7701 *known_errorHandler = 2;
7702 else if (!strcmp(errors, "ignore"))
7703 *known_errorHandler = 3;
7704 else if (!strcmp(errors, "xmlcharrefreplace"))
7705 *known_errorHandler = 4;
7706 else
7707 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007708 }
7709 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007710 case 1: /* strict */
7711 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7712 return -1;
7713 case 2: /* replace */
7714 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007715 x = charmapencode_output('?', mapping, res, respos);
7716 if (x==enc_EXCEPTION) {
7717 return -1;
7718 }
7719 else if (x==enc_FAILED) {
7720 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7721 return -1;
7722 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007723 }
7724 /* fall through */
7725 case 3: /* ignore */
7726 *inpos = collendpos;
7727 break;
7728 case 4: /* xmlcharrefreplace */
7729 /* generate replacement (temporarily (mis)uses p) */
7730 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007731 char buffer[2+29+1+1];
7732 char *cp;
7733 sprintf(buffer, "&#%d;", (int)p[collpos]);
7734 for (cp = buffer; *cp; ++cp) {
7735 x = charmapencode_output(*cp, mapping, res, respos);
7736 if (x==enc_EXCEPTION)
7737 return -1;
7738 else if (x==enc_FAILED) {
7739 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7740 return -1;
7741 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007742 }
7743 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007744 *inpos = collendpos;
7745 break;
7746 default:
7747 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007748 encoding, reason, p, size, exceptionObject,
7749 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007750 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007751 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007752 if (PyBytes_Check(repunicode)) {
7753 /* Directly copy bytes result to output. */
7754 Py_ssize_t outsize = PyBytes_Size(*res);
7755 Py_ssize_t requiredsize;
7756 repsize = PyBytes_Size(repunicode);
7757 requiredsize = *respos + repsize;
7758 if (requiredsize > outsize)
7759 /* Make room for all additional bytes. */
7760 if (charmapencode_resize(res, respos, requiredsize)) {
7761 Py_DECREF(repunicode);
7762 return -1;
7763 }
7764 memcpy(PyBytes_AsString(*res) + *respos,
7765 PyBytes_AsString(repunicode), repsize);
7766 *respos += repsize;
7767 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007768 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007769 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007770 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007771 /* generate replacement */
7772 repsize = PyUnicode_GET_SIZE(repunicode);
7773 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007774 x = charmapencode_output(*uni2, mapping, res, respos);
7775 if (x==enc_EXCEPTION) {
7776 return -1;
7777 }
7778 else if (x==enc_FAILED) {
7779 Py_DECREF(repunicode);
7780 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7781 return -1;
7782 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007783 }
7784 *inpos = newpos;
7785 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007786 }
7787 return 0;
7788}
7789
Alexander Belopolsky40018472011-02-26 01:02:56 +00007790PyObject *
7791PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7792 Py_ssize_t size,
7793 PyObject *mapping,
7794 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007795{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007796 /* output object */
7797 PyObject *res = NULL;
7798 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007799 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007800 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007801 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007802 PyObject *errorHandler = NULL;
7803 PyObject *exc = NULL;
7804 /* the following variable is used for caching string comparisons
7805 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7806 * 3=ignore, 4=xmlcharrefreplace */
7807 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007808
7809 /* Default to Latin-1 */
7810 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007811 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007812
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007813 /* allocate enough for a simple encoding without
7814 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007815 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007816 if (res == NULL)
7817 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007818 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007819 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007820
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007821 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007822 /* try to encode it */
7823 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7824 if (x==enc_EXCEPTION) /* error */
7825 goto onError;
7826 if (x==enc_FAILED) { /* unencodable character */
7827 if (charmap_encoding_error(p, size, &inpos, mapping,
7828 &exc,
7829 &known_errorHandler, &errorHandler, errors,
7830 &res, &respos)) {
7831 goto onError;
7832 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007833 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007834 else
7835 /* done with this character => adjust input position */
7836 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007837 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007838
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007839 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007840 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007841 if (_PyBytes_Resize(&res, respos) < 0)
7842 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007843
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007844 Py_XDECREF(exc);
7845 Py_XDECREF(errorHandler);
7846 return res;
7847
Benjamin Peterson29060642009-01-31 22:14:21 +00007848 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007849 Py_XDECREF(res);
7850 Py_XDECREF(exc);
7851 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007852 return NULL;
7853}
7854
Alexander Belopolsky40018472011-02-26 01:02:56 +00007855PyObject *
7856PyUnicode_AsCharmapString(PyObject *unicode,
7857 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007858{
7859 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007860 PyErr_BadArgument();
7861 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007862 }
7863 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007864 PyUnicode_GET_SIZE(unicode),
7865 mapping,
7866 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007867}
7868
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007869/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007870static void
7871make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007872 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007873 Py_ssize_t startpos, Py_ssize_t endpos,
7874 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007875{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007876 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007877 *exceptionObject = _PyUnicodeTranslateError_Create(
7878 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007879 }
7880 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007881 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7882 goto onError;
7883 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7884 goto onError;
7885 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7886 goto onError;
7887 return;
7888 onError:
7889 Py_DECREF(*exceptionObject);
7890 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007891 }
7892}
7893
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007894/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007895static void
7896raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007897 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007898 Py_ssize_t startpos, Py_ssize_t endpos,
7899 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007900{
7901 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007902 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007903 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007904 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007905}
7906
7907/* error handling callback helper:
7908 build arguments, call the callback and check the arguments,
7909 put the result into newpos and return the replacement string, which
7910 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007911static PyObject *
7912unicode_translate_call_errorhandler(const char *errors,
7913 PyObject **errorHandler,
7914 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007915 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007916 Py_ssize_t startpos, Py_ssize_t endpos,
7917 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007918{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007919 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007920
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007921 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007922 PyObject *restuple;
7923 PyObject *resunicode;
7924
7925 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007926 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007927 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007928 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007929 }
7930
7931 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007932 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007933 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007934 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007935
7936 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007937 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007938 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007939 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007940 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007941 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007942 Py_DECREF(restuple);
7943 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007944 }
7945 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007946 &resunicode, &i_newpos)) {
7947 Py_DECREF(restuple);
7948 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007949 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007950 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007951 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007952 else
7953 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007954 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007955 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7956 Py_DECREF(restuple);
7957 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007958 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007959 Py_INCREF(resunicode);
7960 Py_DECREF(restuple);
7961 return resunicode;
7962}
7963
7964/* Lookup the character ch in the mapping and put the result in result,
7965 which must be decrefed by the caller.
7966 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007967static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007968charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007969{
Christian Heimes217cfd12007-12-02 14:31:20 +00007970 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007971 PyObject *x;
7972
7973 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007974 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007975 x = PyObject_GetItem(mapping, w);
7976 Py_DECREF(w);
7977 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007978 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7979 /* No mapping found means: use 1:1 mapping. */
7980 PyErr_Clear();
7981 *result = NULL;
7982 return 0;
7983 } else
7984 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007985 }
7986 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007987 *result = x;
7988 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007989 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007990 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007991 long value = PyLong_AS_LONG(x);
7992 long max = PyUnicode_GetMax();
7993 if (value < 0 || value > max) {
7994 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007995 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007996 Py_DECREF(x);
7997 return -1;
7998 }
7999 *result = x;
8000 return 0;
8001 }
8002 else if (PyUnicode_Check(x)) {
8003 *result = x;
8004 return 0;
8005 }
8006 else {
8007 /* wrong return value */
8008 PyErr_SetString(PyExc_TypeError,
8009 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008010 Py_DECREF(x);
8011 return -1;
8012 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008013}
8014/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008015 if not reallocate and adjust various state variables.
8016 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008017static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008018charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008019 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008020{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008021 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008022 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008023 /* exponentially overallocate to minimize reallocations */
8024 if (requiredsize < 2 * oldsize)
8025 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008026 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8027 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008028 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008029 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008030 }
8031 return 0;
8032}
8033/* lookup the character, put the result in the output string and adjust
8034 various state variables. Return a new reference to the object that
8035 was put in the output buffer in *result, or Py_None, if the mapping was
8036 undefined (in which case no character was written).
8037 The called must decref result.
8038 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008039static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008040charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8041 PyObject *mapping, Py_UCS4 **output,
8042 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008043 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008044{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008045 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8046 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008047 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008048 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008049 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008050 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008051 }
8052 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008053 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008054 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008055 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008056 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008057 }
8058 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008059 Py_ssize_t repsize;
8060 if (PyUnicode_READY(*res) == -1)
8061 return -1;
8062 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008063 if (repsize==1) {
8064 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008065 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008066 }
8067 else if (repsize!=0) {
8068 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008069 Py_ssize_t requiredsize = *opos +
8070 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008071 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008072 Py_ssize_t i;
8073 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008074 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008075 for(i = 0; i < repsize; i++)
8076 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008077 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008078 }
8079 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008080 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008081 return 0;
8082}
8083
Alexander Belopolsky40018472011-02-26 01:02:56 +00008084PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008085_PyUnicode_TranslateCharmap(PyObject *input,
8086 PyObject *mapping,
8087 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008088{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008089 /* input object */
8090 char *idata;
8091 Py_ssize_t size, i;
8092 int kind;
8093 /* output buffer */
8094 Py_UCS4 *output = NULL;
8095 Py_ssize_t osize;
8096 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008097 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008098 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008099 char *reason = "character maps to <undefined>";
8100 PyObject *errorHandler = NULL;
8101 PyObject *exc = NULL;
8102 /* the following variable is used for caching string comparisons
8103 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8104 * 3=ignore, 4=xmlcharrefreplace */
8105 int known_errorHandler = -1;
8106
Guido van Rossumd57fd912000-03-10 22:53:23 +00008107 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008108 PyErr_BadArgument();
8109 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008110 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008111
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008112 if (PyUnicode_READY(input) == -1)
8113 return NULL;
8114 idata = (char*)PyUnicode_DATA(input);
8115 kind = PyUnicode_KIND(input);
8116 size = PyUnicode_GET_LENGTH(input);
8117 i = 0;
8118
8119 if (size == 0) {
8120 Py_INCREF(input);
8121 return input;
8122 }
8123
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008124 /* allocate enough for a simple 1:1 translation without
8125 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008126 osize = size;
8127 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8128 opos = 0;
8129 if (output == NULL) {
8130 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008131 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008132 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008133
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008134 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008135 /* try to encode it */
8136 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008137 if (charmaptranslate_output(input, i, mapping,
8138 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008139 Py_XDECREF(x);
8140 goto onError;
8141 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008142 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008143 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008144 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008145 else { /* untranslatable character */
8146 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8147 Py_ssize_t repsize;
8148 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008149 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008150 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008151 Py_ssize_t collstart = i;
8152 Py_ssize_t collend = i+1;
8153 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008154
Benjamin Peterson29060642009-01-31 22:14:21 +00008155 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008156 while (collend < size) {
8157 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008158 goto onError;
8159 Py_XDECREF(x);
8160 if (x!=Py_None)
8161 break;
8162 ++collend;
8163 }
8164 /* cache callback name lookup
8165 * (if not done yet, i.e. it's the first error) */
8166 if (known_errorHandler==-1) {
8167 if ((errors==NULL) || (!strcmp(errors, "strict")))
8168 known_errorHandler = 1;
8169 else if (!strcmp(errors, "replace"))
8170 known_errorHandler = 2;
8171 else if (!strcmp(errors, "ignore"))
8172 known_errorHandler = 3;
8173 else if (!strcmp(errors, "xmlcharrefreplace"))
8174 known_errorHandler = 4;
8175 else
8176 known_errorHandler = 0;
8177 }
8178 switch (known_errorHandler) {
8179 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008180 raise_translate_exception(&exc, input, collstart,
8181 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008182 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008183 case 2: /* replace */
8184 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008185 for (coll = collstart; coll<collend; coll++)
8186 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008187 /* fall through */
8188 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008189 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008190 break;
8191 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008192 /* generate replacement (temporarily (mis)uses i) */
8193 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008194 char buffer[2+29+1+1];
8195 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008196 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8197 if (charmaptranslate_makespace(&output, &osize,
8198 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008199 goto onError;
8200 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008201 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008202 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008203 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008204 break;
8205 default:
8206 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008207 reason, input, &exc,
8208 collstart, collend, &newpos);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008209 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00008210 goto onError;
8211 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008212 repsize = PyUnicode_GET_LENGTH(repunicode);
8213 if (charmaptranslate_makespace(&output, &osize,
8214 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008215 Py_DECREF(repunicode);
8216 goto onError;
8217 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008218 for (uni2 = 0; repsize-->0; ++uni2)
8219 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8220 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008221 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008222 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008223 }
8224 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008225 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8226 if (!res)
8227 goto onError;
8228 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008229 Py_XDECREF(exc);
8230 Py_XDECREF(errorHandler);
8231 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008232
Benjamin Peterson29060642009-01-31 22:14:21 +00008233 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008234 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008235 Py_XDECREF(exc);
8236 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008237 return NULL;
8238}
8239
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008240/* Deprecated. Use PyUnicode_Translate instead. */
8241PyObject *
8242PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8243 Py_ssize_t size,
8244 PyObject *mapping,
8245 const char *errors)
8246{
8247 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8248 if (!unicode)
8249 return NULL;
8250 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8251}
8252
Alexander Belopolsky40018472011-02-26 01:02:56 +00008253PyObject *
8254PyUnicode_Translate(PyObject *str,
8255 PyObject *mapping,
8256 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008257{
8258 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008259
Guido van Rossumd57fd912000-03-10 22:53:23 +00008260 str = PyUnicode_FromObject(str);
8261 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008262 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008263 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008264 Py_DECREF(str);
8265 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008266
Benjamin Peterson29060642009-01-31 22:14:21 +00008267 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008268 Py_XDECREF(str);
8269 return NULL;
8270}
Tim Petersced69f82003-09-16 20:30:58 +00008271
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008272static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008273fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008274{
8275 /* No need to call PyUnicode_READY(self) because this function is only
8276 called as a callback from fixup() which does it already. */
8277 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8278 const int kind = PyUnicode_KIND(self);
8279 void *data = PyUnicode_DATA(self);
8280 Py_UCS4 maxchar = 0, ch, fixed;
8281 Py_ssize_t i;
8282
8283 for (i = 0; i < len; ++i) {
8284 ch = PyUnicode_READ(kind, data, i);
8285 fixed = 0;
8286 if (ch > 127) {
8287 if (Py_UNICODE_ISSPACE(ch))
8288 fixed = ' ';
8289 else {
8290 const int decimal = Py_UNICODE_TODECIMAL(ch);
8291 if (decimal >= 0)
8292 fixed = '0' + decimal;
8293 }
8294 if (fixed != 0) {
8295 if (fixed > maxchar)
8296 maxchar = fixed;
8297 PyUnicode_WRITE(kind, data, i, fixed);
8298 }
8299 else if (ch > maxchar)
8300 maxchar = ch;
8301 }
8302 else if (ch > maxchar)
8303 maxchar = ch;
8304 }
8305
8306 return maxchar;
8307}
8308
8309PyObject *
8310_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8311{
8312 if (!PyUnicode_Check(unicode)) {
8313 PyErr_BadInternalCall();
8314 return NULL;
8315 }
8316 if (PyUnicode_READY(unicode) == -1)
8317 return NULL;
8318 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8319 /* If the string is already ASCII, just return the same string */
8320 Py_INCREF(unicode);
8321 return unicode;
8322 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008323 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008324}
8325
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008326PyObject *
8327PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8328 Py_ssize_t length)
8329{
8330 PyObject *result;
8331 Py_UNICODE *p; /* write pointer into result */
8332 Py_ssize_t i;
8333 /* Copy to a new string */
8334 result = (PyObject *)_PyUnicode_New(length);
8335 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8336 if (result == NULL)
8337 return result;
8338 p = PyUnicode_AS_UNICODE(result);
8339 /* Iterate over code points */
8340 for (i = 0; i < length; i++) {
8341 Py_UNICODE ch =s[i];
8342 if (ch > 127) {
8343 int decimal = Py_UNICODE_TODECIMAL(ch);
8344 if (decimal >= 0)
8345 p[i] = '0' + decimal;
8346 }
8347 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008348#ifndef DONT_MAKE_RESULT_READY
8349 if (_PyUnicode_READY_REPLACE(&result)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008350 Py_DECREF(result);
8351 return NULL;
8352 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008353#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02008354 assert(_PyUnicode_CheckConsistency(result, 1));
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008355 return result;
8356}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008357/* --- Decimal Encoder ---------------------------------------------------- */
8358
Alexander Belopolsky40018472011-02-26 01:02:56 +00008359int
8360PyUnicode_EncodeDecimal(Py_UNICODE *s,
8361 Py_ssize_t length,
8362 char *output,
8363 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008364{
8365 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008366 PyObject *errorHandler = NULL;
8367 PyObject *exc = NULL;
8368 const char *encoding = "decimal";
8369 const char *reason = "invalid decimal Unicode string";
8370 /* the following variable is used for caching string comparisons
8371 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8372 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008373
8374 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008375 PyErr_BadArgument();
8376 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008377 }
8378
8379 p = s;
8380 end = s + length;
8381 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008382 register Py_UNICODE ch = *p;
8383 int decimal;
8384 PyObject *repunicode;
8385 Py_ssize_t repsize;
8386 Py_ssize_t newpos;
8387 Py_UNICODE *uni2;
8388 Py_UNICODE *collstart;
8389 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00008390
Benjamin Peterson29060642009-01-31 22:14:21 +00008391 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008392 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008393 ++p;
8394 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008395 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008396 decimal = Py_UNICODE_TODECIMAL(ch);
8397 if (decimal >= 0) {
8398 *output++ = '0' + decimal;
8399 ++p;
8400 continue;
8401 }
8402 if (0 < ch && ch < 256) {
8403 *output++ = (char)ch;
8404 ++p;
8405 continue;
8406 }
8407 /* All other characters are considered unencodable */
8408 collstart = p;
8409 collend = p+1;
8410 while (collend < end) {
8411 if ((0 < *collend && *collend < 256) ||
8412 !Py_UNICODE_ISSPACE(*collend) ||
8413 Py_UNICODE_TODECIMAL(*collend))
8414 break;
8415 }
8416 /* cache callback name lookup
8417 * (if not done yet, i.e. it's the first error) */
8418 if (known_errorHandler==-1) {
8419 if ((errors==NULL) || (!strcmp(errors, "strict")))
8420 known_errorHandler = 1;
8421 else if (!strcmp(errors, "replace"))
8422 known_errorHandler = 2;
8423 else if (!strcmp(errors, "ignore"))
8424 known_errorHandler = 3;
8425 else if (!strcmp(errors, "xmlcharrefreplace"))
8426 known_errorHandler = 4;
8427 else
8428 known_errorHandler = 0;
8429 }
8430 switch (known_errorHandler) {
8431 case 1: /* strict */
8432 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8433 goto onError;
8434 case 2: /* replace */
8435 for (p = collstart; p < collend; ++p)
8436 *output++ = '?';
8437 /* fall through */
8438 case 3: /* ignore */
8439 p = collend;
8440 break;
8441 case 4: /* xmlcharrefreplace */
8442 /* generate replacement (temporarily (mis)uses p) */
8443 for (p = collstart; p < collend; ++p)
8444 output += sprintf(output, "&#%d;", (int)*p);
8445 p = collend;
8446 break;
8447 default:
8448 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8449 encoding, reason, s, length, &exc,
8450 collstart-s, collend-s, &newpos);
8451 if (repunicode == NULL)
8452 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008453 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008454 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008455 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8456 Py_DECREF(repunicode);
8457 goto onError;
8458 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008459 /* generate replacement */
8460 repsize = PyUnicode_GET_SIZE(repunicode);
8461 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8462 Py_UNICODE ch = *uni2;
8463 if (Py_UNICODE_ISSPACE(ch))
8464 *output++ = ' ';
8465 else {
8466 decimal = Py_UNICODE_TODECIMAL(ch);
8467 if (decimal >= 0)
8468 *output++ = '0' + decimal;
8469 else if (0 < ch && ch < 256)
8470 *output++ = (char)ch;
8471 else {
8472 Py_DECREF(repunicode);
8473 raise_encode_exception(&exc, encoding,
8474 s, length, collstart-s, collend-s, reason);
8475 goto onError;
8476 }
8477 }
8478 }
8479 p = s + newpos;
8480 Py_DECREF(repunicode);
8481 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008482 }
8483 /* 0-terminate the output string */
8484 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008485 Py_XDECREF(exc);
8486 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008487 return 0;
8488
Benjamin Peterson29060642009-01-31 22:14:21 +00008489 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008490 Py_XDECREF(exc);
8491 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008492 return -1;
8493}
8494
Guido van Rossumd57fd912000-03-10 22:53:23 +00008495/* --- Helpers ------------------------------------------------------------ */
8496
Victor Stinnerc3cec782011-10-05 21:24:08 +02008497#include "stringlib/asciilib.h"
8498#include "stringlib/fastsearch.h"
8499#include "stringlib/partition.h"
8500#include "stringlib/split.h"
8501#include "stringlib/count.h"
8502#include "stringlib/find.h"
8503#include "stringlib/localeutil.h"
8504#include "stringlib/undef.h"
8505
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008506#include "stringlib/ucs1lib.h"
8507#include "stringlib/fastsearch.h"
8508#include "stringlib/partition.h"
8509#include "stringlib/split.h"
8510#include "stringlib/count.h"
8511#include "stringlib/find.h"
8512#include "stringlib/localeutil.h"
8513#include "stringlib/undef.h"
8514
8515#include "stringlib/ucs2lib.h"
8516#include "stringlib/fastsearch.h"
8517#include "stringlib/partition.h"
8518#include "stringlib/split.h"
8519#include "stringlib/count.h"
8520#include "stringlib/find.h"
8521#include "stringlib/localeutil.h"
8522#include "stringlib/undef.h"
8523
8524#include "stringlib/ucs4lib.h"
8525#include "stringlib/fastsearch.h"
8526#include "stringlib/partition.h"
8527#include "stringlib/split.h"
8528#include "stringlib/count.h"
8529#include "stringlib/find.h"
8530#include "stringlib/localeutil.h"
8531#include "stringlib/undef.h"
8532
8533static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02008534any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ascii)(const Py_UCS1*, Py_ssize_t,
8535 const Py_UCS1*, Py_ssize_t,
8536 Py_ssize_t, Py_ssize_t),
8537 Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008538 const Py_UCS1*, Py_ssize_t,
8539 Py_ssize_t, Py_ssize_t),
8540 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
8541 const Py_UCS2*, Py_ssize_t,
8542 Py_ssize_t, Py_ssize_t),
8543 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
8544 const Py_UCS4*, Py_ssize_t,
8545 Py_ssize_t, Py_ssize_t),
8546 PyObject* s1, PyObject* s2,
8547 Py_ssize_t start,
8548 Py_ssize_t end)
8549{
8550 int kind1, kind2, kind;
8551 void *buf1, *buf2;
8552 Py_ssize_t len1, len2, result;
8553
8554 kind1 = PyUnicode_KIND(s1);
8555 kind2 = PyUnicode_KIND(s2);
8556 kind = kind1 > kind2 ? kind1 : kind2;
8557 buf1 = PyUnicode_DATA(s1);
8558 buf2 = PyUnicode_DATA(s2);
8559 if (kind1 != kind)
8560 buf1 = _PyUnicode_AsKind(s1, kind);
8561 if (!buf1)
8562 return -2;
8563 if (kind2 != kind)
8564 buf2 = _PyUnicode_AsKind(s2, kind);
8565 if (!buf2) {
8566 if (kind1 != kind) PyMem_Free(buf1);
8567 return -2;
8568 }
8569 len1 = PyUnicode_GET_LENGTH(s1);
8570 len2 = PyUnicode_GET_LENGTH(s2);
8571
8572 switch(kind) {
8573 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008574 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8575 result = ascii(buf1, len1, buf2, len2, start, end);
8576 else
8577 result = ucs1(buf1, len1, buf2, len2, start, end);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008578 break;
8579 case PyUnicode_2BYTE_KIND:
8580 result = ucs2(buf1, len1, buf2, len2, start, end);
8581 break;
8582 case PyUnicode_4BYTE_KIND:
8583 result = ucs4(buf1, len1, buf2, len2, start, end);
8584 break;
8585 default:
8586 assert(0); result = -2;
8587 }
8588
8589 if (kind1 != kind)
8590 PyMem_Free(buf1);
8591 if (kind2 != kind)
8592 PyMem_Free(buf2);
8593
8594 return result;
8595}
8596
8597Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02008598_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008599 Py_ssize_t n_buffer,
8600 void *digits, Py_ssize_t n_digits,
8601 Py_ssize_t min_width,
8602 const char *grouping,
8603 const char *thousands_sep)
8604{
8605 switch(kind) {
8606 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008607 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
8608 return _PyUnicode_ascii_InsertThousandsGrouping(
8609 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8610 min_width, grouping, thousands_sep);
8611 else
8612 return _PyUnicode_ucs1_InsertThousandsGrouping(
8613 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8614 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008615 case PyUnicode_2BYTE_KIND:
8616 return _PyUnicode_ucs2_InsertThousandsGrouping(
8617 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8618 min_width, grouping, thousands_sep);
8619 case PyUnicode_4BYTE_KIND:
8620 return _PyUnicode_ucs4_InsertThousandsGrouping(
8621 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8622 min_width, grouping, thousands_sep);
8623 }
8624 assert(0);
8625 return -1;
8626}
8627
8628
Eric Smith8c663262007-08-25 02:26:07 +00008629#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00008630#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008631
Thomas Wouters477c8d52006-05-27 19:21:47 +00008632#include "stringlib/count.h"
8633#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00008634
/* Helper macro to fix up start/end slice values for a sequence of
   length len: end is clipped to len, negative values are taken relative
   to len, and anything still negative becomes 0.  start and end must be
   modifiable lvalues; len is evaluated more than once.

   The body is wrapped in do { } while (0) so that the macro expands to
   a single statement and is safe under an unbraced if/else; invoke it
   with a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008649
Alexander Belopolsky40018472011-02-26 01:02:56 +00008650Py_ssize_t
8651PyUnicode_Count(PyObject *str,
8652 PyObject *substr,
8653 Py_ssize_t start,
8654 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008655{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008656 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008657 PyUnicodeObject* str_obj;
8658 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008659 int kind1, kind2, kind;
8660 void *buf1 = NULL, *buf2 = NULL;
8661 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008662
Thomas Wouters477c8d52006-05-27 19:21:47 +00008663 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008664 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008665 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008666 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008667 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008668 Py_DECREF(str_obj);
8669 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008670 }
Tim Petersced69f82003-09-16 20:30:58 +00008671
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008672 kind1 = PyUnicode_KIND(str_obj);
8673 kind2 = PyUnicode_KIND(sub_obj);
8674 kind = kind1 > kind2 ? kind1 : kind2;
8675 buf1 = PyUnicode_DATA(str_obj);
8676 if (kind1 != kind)
8677 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8678 if (!buf1)
8679 goto onError;
8680 buf2 = PyUnicode_DATA(sub_obj);
8681 if (kind2 != kind)
8682 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8683 if (!buf2)
8684 goto onError;
8685 len1 = PyUnicode_GET_LENGTH(str_obj);
8686 len2 = PyUnicode_GET_LENGTH(sub_obj);
8687
8688 ADJUST_INDICES(start, end, len1);
8689 switch(kind) {
8690 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008691 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8692 result = asciilib_count(
8693 ((Py_UCS1*)buf1) + start, end - start,
8694 buf2, len2, PY_SSIZE_T_MAX
8695 );
8696 else
8697 result = ucs1lib_count(
8698 ((Py_UCS1*)buf1) + start, end - start,
8699 buf2, len2, PY_SSIZE_T_MAX
8700 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008701 break;
8702 case PyUnicode_2BYTE_KIND:
8703 result = ucs2lib_count(
8704 ((Py_UCS2*)buf1) + start, end - start,
8705 buf2, len2, PY_SSIZE_T_MAX
8706 );
8707 break;
8708 case PyUnicode_4BYTE_KIND:
8709 result = ucs4lib_count(
8710 ((Py_UCS4*)buf1) + start, end - start,
8711 buf2, len2, PY_SSIZE_T_MAX
8712 );
8713 break;
8714 default:
8715 assert(0); result = 0;
8716 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008717
8718 Py_DECREF(sub_obj);
8719 Py_DECREF(str_obj);
8720
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008721 if (kind1 != kind)
8722 PyMem_Free(buf1);
8723 if (kind2 != kind)
8724 PyMem_Free(buf2);
8725
Guido van Rossumd57fd912000-03-10 22:53:23 +00008726 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008727 onError:
8728 Py_DECREF(sub_obj);
8729 Py_DECREF(str_obj);
8730 if (kind1 != kind && buf1)
8731 PyMem_Free(buf1);
8732 if (kind2 != kind && buf2)
8733 PyMem_Free(buf2);
8734 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008735}
8736
Alexander Belopolsky40018472011-02-26 01:02:56 +00008737Py_ssize_t
8738PyUnicode_Find(PyObject *str,
8739 PyObject *sub,
8740 Py_ssize_t start,
8741 Py_ssize_t end,
8742 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008743{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008744 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008745
Guido van Rossumd57fd912000-03-10 22:53:23 +00008746 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008747 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008748 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008749 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008750 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008751 Py_DECREF(str);
8752 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008753 }
Tim Petersced69f82003-09-16 20:30:58 +00008754
Thomas Wouters477c8d52006-05-27 19:21:47 +00008755 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008756 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008757 asciilib_find_slice, ucs1lib_find_slice,
8758 ucs2lib_find_slice, ucs4lib_find_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008759 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008760 );
8761 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008762 result = any_find_slice(
Antoine Pitroueaf139b2011-10-09 00:33:09 +02008763 asciilib_rfind_slice, ucs1lib_rfind_slice,
Victor Stinnerc3cec782011-10-05 21:24:08 +02008764 ucs2lib_rfind_slice, ucs4lib_rfind_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008765 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008766 );
8767
Guido van Rossumd57fd912000-03-10 22:53:23 +00008768 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008769 Py_DECREF(sub);
8770
Guido van Rossumd57fd912000-03-10 22:53:23 +00008771 return result;
8772}
8773
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008774Py_ssize_t
8775PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8776 Py_ssize_t start, Py_ssize_t end,
8777 int direction)
8778{
8779 char *result;
8780 int kind;
8781 if (PyUnicode_READY(str) == -1)
8782 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008783 if (start < 0 || end < 0) {
8784 PyErr_SetString(PyExc_IndexError, "string index out of range");
8785 return -2;
8786 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008787 if (end > PyUnicode_GET_LENGTH(str))
8788 end = PyUnicode_GET_LENGTH(str);
8789 kind = PyUnicode_KIND(str);
8790 result = findchar(PyUnicode_1BYTE_DATA(str)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008791 + kind*start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008792 kind,
8793 end-start, ch, direction);
8794 if (!result)
8795 return -1;
8796 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8797}
8798
Alexander Belopolsky40018472011-02-26 01:02:56 +00008799static int
8800tailmatch(PyUnicodeObject *self,
8801 PyUnicodeObject *substring,
8802 Py_ssize_t start,
8803 Py_ssize_t end,
8804 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008805{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008806 int kind_self;
8807 int kind_sub;
8808 void *data_self;
8809 void *data_sub;
8810 Py_ssize_t offset;
8811 Py_ssize_t i;
8812 Py_ssize_t end_sub;
8813
8814 if (PyUnicode_READY(self) == -1 ||
8815 PyUnicode_READY(substring) == -1)
8816 return 0;
8817
8818 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008819 return 1;
8820
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008821 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8822 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008823 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008824 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008826 kind_self = PyUnicode_KIND(self);
8827 data_self = PyUnicode_DATA(self);
8828 kind_sub = PyUnicode_KIND(substring);
8829 data_sub = PyUnicode_DATA(substring);
8830 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8831
8832 if (direction > 0)
8833 offset = end;
8834 else
8835 offset = start;
8836
8837 if (PyUnicode_READ(kind_self, data_self, offset) ==
8838 PyUnicode_READ(kind_sub, data_sub, 0) &&
8839 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8840 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8841 /* If both are of the same kind, memcmp is sufficient */
8842 if (kind_self == kind_sub) {
8843 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008844 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008845 data_sub,
8846 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008847 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008848 }
8849 /* otherwise we have to compare each character by first accesing it */
8850 else {
8851 /* We do not need to compare 0 and len(substring)-1 because
8852 the if statement above ensured already that they are equal
8853 when we end up here. */
8854 // TODO: honor direction and do a forward or backwards search
8855 for (i = 1; i < end_sub; ++i) {
8856 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8857 PyUnicode_READ(kind_sub, data_sub, i))
8858 return 0;
8859 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008860 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008861 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008862 }
8863
8864 return 0;
8865}
8866
Alexander Belopolsky40018472011-02-26 01:02:56 +00008867Py_ssize_t
8868PyUnicode_Tailmatch(PyObject *str,
8869 PyObject *substr,
8870 Py_ssize_t start,
8871 Py_ssize_t end,
8872 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008873{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008874 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008875
Guido van Rossumd57fd912000-03-10 22:53:23 +00008876 str = PyUnicode_FromObject(str);
8877 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008878 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008879 substr = PyUnicode_FromObject(substr);
8880 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008881 Py_DECREF(str);
8882 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008883 }
Tim Petersced69f82003-09-16 20:30:58 +00008884
Guido van Rossumd57fd912000-03-10 22:53:23 +00008885 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008886 (PyUnicodeObject *)substr,
8887 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008888 Py_DECREF(str);
8889 Py_DECREF(substr);
8890 return result;
8891}
8892
Guido van Rossumd57fd912000-03-10 22:53:23 +00008893/* Apply fixfct filter to the Unicode object self and return a
8894 reference to the modified object */
8895
Alexander Belopolsky40018472011-02-26 01:02:56 +00008896static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02008897fixup(PyObject *self,
8898 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008899{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008900 PyObject *u;
8901 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008902
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008903 if (PyUnicode_READY(self) == -1)
8904 return NULL;
8905 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8906 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8907 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008908 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008909 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008910
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008911 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008912 PyUnicode_GET_LENGTH(u) * PyUnicode_KIND(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008913
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008914 /* fix functions return the new maximum character in a string,
8915 if the kind of the resulting unicode object does not change,
8916 everything is fine. Otherwise we need to change the string kind
8917 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02008918 maxchar_new = fixfct(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008919 if (maxchar_new == 0)
8920 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8921 else if (maxchar_new <= 127)
8922 maxchar_new = 127;
8923 else if (maxchar_new <= 255)
8924 maxchar_new = 255;
8925 else if (maxchar_new <= 65535)
8926 maxchar_new = 65535;
8927 else
8928 maxchar_new = 1114111; /* 0x10ffff */
8929
8930 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008931 /* fixfct should return TRUE if it modified the buffer. If
8932 FALSE, return a reference to the original buffer instead
8933 (to save space, not time) */
8934 Py_INCREF(self);
8935 Py_DECREF(u);
8936 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008937 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008938 else if (maxchar_new == maxchar_old) {
8939 return u;
8940 }
8941 else {
8942 /* In case the maximum character changed, we need to
8943 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008944 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008945 if (v == NULL) {
8946 Py_DECREF(u);
8947 return NULL;
8948 }
8949 if (maxchar_new > maxchar_old) {
8950 /* If the maxchar increased so that the kind changed, not all
8951 characters are representable anymore and we need to fix the
8952 string again. This only happens in very few cases. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02008953 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner9310abb2011-10-05 00:59:23 +02008954 maxchar_old = fixfct(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008955 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8956 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008957 else {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02008958 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008959 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008960
8961 Py_DECREF(u);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02008962 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008963 return v;
8964 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008965}
8966
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008967static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008968fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008969{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008970 /* No need to call PyUnicode_READY(self) because this function is only
8971 called as a callback from fixup() which does it already. */
8972 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8973 const int kind = PyUnicode_KIND(self);
8974 void *data = PyUnicode_DATA(self);
8975 int touched = 0;
8976 Py_UCS4 maxchar = 0;
8977 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008978
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008979 for (i = 0; i < len; ++i) {
8980 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8981 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8982 if (up != ch) {
8983 if (up > maxchar)
8984 maxchar = up;
8985 PyUnicode_WRITE(kind, data, i, up);
8986 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008987 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008988 else if (ch > maxchar)
8989 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008990 }
8991
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008992 if (touched)
8993 return maxchar;
8994 else
8995 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008996}
8997
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008998static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008999fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009000{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009001 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9002 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9003 const int kind = PyUnicode_KIND(self);
9004 void *data = PyUnicode_DATA(self);
9005 int touched = 0;
9006 Py_UCS4 maxchar = 0;
9007 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009008
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009009 for(i = 0; i < len; ++i) {
9010 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9011 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9012 if (lo != ch) {
9013 if (lo > maxchar)
9014 maxchar = lo;
9015 PyUnicode_WRITE(kind, data, i, lo);
9016 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009017 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009018 else if (ch > maxchar)
9019 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009020 }
9021
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009022 if (touched)
9023 return maxchar;
9024 else
9025 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009026}
9027
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009028static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009029fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009030{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009031 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9032 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9033 const int kind = PyUnicode_KIND(self);
9034 void *data = PyUnicode_DATA(self);
9035 int touched = 0;
9036 Py_UCS4 maxchar = 0;
9037 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009038
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009039 for(i = 0; i < len; ++i) {
9040 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9041 Py_UCS4 nu = 0;
9042
9043 if (Py_UNICODE_ISUPPER(ch))
9044 nu = Py_UNICODE_TOLOWER(ch);
9045 else if (Py_UNICODE_ISLOWER(ch))
9046 nu = Py_UNICODE_TOUPPER(ch);
9047
9048 if (nu != 0) {
9049 if (nu > maxchar)
9050 maxchar = nu;
9051 PyUnicode_WRITE(kind, data, i, nu);
9052 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009053 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009054 else if (ch > maxchar)
9055 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009056 }
9057
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009058 if (touched)
9059 return maxchar;
9060 else
9061 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009062}
9063
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009064static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009065fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009066{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009067 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9068 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9069 const int kind = PyUnicode_KIND(self);
9070 void *data = PyUnicode_DATA(self);
9071 int touched = 0;
9072 Py_UCS4 maxchar = 0;
9073 Py_ssize_t i = 0;
9074 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00009075
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009076 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009077 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009078
9079 ch = PyUnicode_READ(kind, data, i);
9080 if (!Py_UNICODE_ISUPPER(ch)) {
9081 maxchar = Py_UNICODE_TOUPPER(ch);
9082 PyUnicode_WRITE(kind, data, i, maxchar);
9083 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009084 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009085 ++i;
9086 for(; i < len; ++i) {
9087 ch = PyUnicode_READ(kind, data, i);
9088 if (!Py_UNICODE_ISLOWER(ch)) {
9089 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9090 if (lo > maxchar)
9091 maxchar = lo;
9092 PyUnicode_WRITE(kind, data, i, lo);
9093 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009094 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009095 else if (ch > maxchar)
9096 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009097 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009098
9099 if (touched)
9100 return maxchar;
9101 else
9102 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009103}
9104
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009105static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009106fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009107{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009108 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9109 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9110 const int kind = PyUnicode_KIND(self);
9111 void *data = PyUnicode_DATA(self);
9112 Py_UCS4 maxchar = 0;
9113 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009114 int previous_is_cased;
9115
9116 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009117 if (len == 1) {
9118 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9119 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9120 if (ti != ch) {
9121 PyUnicode_WRITE(kind, data, i, ti);
9122 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00009123 }
9124 else
9125 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009126 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009127 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009128 for(; i < len; ++i) {
9129 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9130 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00009131
Benjamin Peterson29060642009-01-31 22:14:21 +00009132 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009133 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00009134 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009135 nu = Py_UNICODE_TOTITLE(ch);
9136
9137 if (nu > maxchar)
9138 maxchar = nu;
9139 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00009140
Benjamin Peterson29060642009-01-31 22:14:21 +00009141 if (Py_UNICODE_ISLOWER(ch) ||
9142 Py_UNICODE_ISUPPER(ch) ||
9143 Py_UNICODE_ISTITLE(ch))
9144 previous_is_cased = 1;
9145 else
9146 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009147 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009148 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009149}
9150
Tim Peters8ce9f162004-08-27 01:49:32 +00009151PyObject *
9152PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009153{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009154 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009155 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009156 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009157 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009158 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9159 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009160 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009161 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009162 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009163 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009164 int use_memcpy;
9165 unsigned char *res_data = NULL, *sep_data = NULL;
9166 PyObject *last_obj;
9167 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009168
Tim Peters05eba1f2004-08-27 21:32:02 +00009169 fseq = PySequence_Fast(seq, "");
9170 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009171 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009172 }
9173
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009174 /* NOTE: the following code can't call back into Python code,
9175 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009176 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009177
Tim Peters05eba1f2004-08-27 21:32:02 +00009178 seqlen = PySequence_Fast_GET_SIZE(fseq);
9179 /* If empty sequence, return u"". */
9180 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009181 Py_DECREF(fseq);
9182 Py_INCREF(unicode_empty);
9183 res = unicode_empty;
9184 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009185 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009186
Tim Peters05eba1f2004-08-27 21:32:02 +00009187 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009188 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009189 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009190 if (seqlen == 1) {
9191 if (PyUnicode_CheckExact(items[0])) {
9192 res = items[0];
9193 Py_INCREF(res);
9194 Py_DECREF(fseq);
9195 return res;
9196 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009197 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009198 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009199 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009200 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009201 /* Set up sep and seplen */
9202 if (separator == NULL) {
9203 /* fall back to a blank space separator */
9204 sep = PyUnicode_FromOrdinal(' ');
9205 if (!sep)
9206 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009207 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009208 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009209 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009210 else {
9211 if (!PyUnicode_Check(separator)) {
9212 PyErr_Format(PyExc_TypeError,
9213 "separator: expected str instance,"
9214 " %.80s found",
9215 Py_TYPE(separator)->tp_name);
9216 goto onError;
9217 }
9218 if (PyUnicode_READY(separator))
9219 goto onError;
9220 sep = separator;
9221 seplen = PyUnicode_GET_LENGTH(separator);
9222 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9223 /* inc refcount to keep this code path symmetric with the
9224 above case of a blank separator */
9225 Py_INCREF(sep);
9226 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009227 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009228 }
9229
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009230 /* There are at least two things to join, or else we have a subclass
9231 * of str in the sequence.
9232 * Do a pre-pass to figure out the total amount of space we'll
9233 * need (sz), and see whether all argument are strings.
9234 */
9235 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009236#ifdef Py_DEBUG
9237 use_memcpy = 0;
9238#else
9239 use_memcpy = 1;
9240#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009241 for (i = 0; i < seqlen; i++) {
9242 const Py_ssize_t old_sz = sz;
9243 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009244 if (!PyUnicode_Check(item)) {
9245 PyErr_Format(PyExc_TypeError,
9246 "sequence item %zd: expected str instance,"
9247 " %.80s found",
9248 i, Py_TYPE(item)->tp_name);
9249 goto onError;
9250 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009251 if (PyUnicode_READY(item) == -1)
9252 goto onError;
9253 sz += PyUnicode_GET_LENGTH(item);
9254 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009255 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009256 if (i != 0)
9257 sz += seplen;
9258 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9259 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009260 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009261 goto onError;
9262 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009263 if (use_memcpy && last_obj != NULL) {
9264 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9265 use_memcpy = 0;
9266 }
9267 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009268 }
Tim Petersced69f82003-09-16 20:30:58 +00009269
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009270 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009271 if (res == NULL)
9272 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009273
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009274 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009275#ifdef Py_DEBUG
9276 use_memcpy = 0;
9277#else
9278 if (use_memcpy) {
9279 res_data = PyUnicode_1BYTE_DATA(res);
9280 kind = PyUnicode_KIND(res);
9281 if (seplen != 0)
9282 sep_data = PyUnicode_1BYTE_DATA(sep);
9283 }
9284#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009285 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009286 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009287 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009288 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009289 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009290 if (use_memcpy) {
9291 Py_MEMCPY(res_data,
9292 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009293 kind * seplen);
9294 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009295 }
9296 else {
9297 copy_characters(res, res_offset, sep, 0, seplen);
9298 res_offset += seplen;
9299 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009300 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009301 itemlen = PyUnicode_GET_LENGTH(item);
9302 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009303 if (use_memcpy) {
9304 Py_MEMCPY(res_data,
9305 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009306 kind * itemlen);
9307 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009308 }
9309 else {
9310 copy_characters(res, res_offset, item, 0, itemlen);
9311 res_offset += itemlen;
9312 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009313 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009314 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009315 if (use_memcpy)
9316 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009317 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009318 else
9319 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009320
Tim Peters05eba1f2004-08-27 21:32:02 +00009321 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009322 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009323 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009324 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009325
Benjamin Peterson29060642009-01-31 22:14:21 +00009326 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009327 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009328 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009329 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009330 return NULL;
9331}
9332
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009333#define FILL(kind, data, value, start, length) \
9334 do { \
9335 Py_ssize_t i_ = 0; \
9336 assert(kind != PyUnicode_WCHAR_KIND); \
9337 switch ((kind)) { \
9338 case PyUnicode_1BYTE_KIND: { \
9339 unsigned char * to_ = (unsigned char *)((data)) + (start); \
9340 memset(to_, (unsigned char)value, length); \
9341 break; \
9342 } \
9343 case PyUnicode_2BYTE_KIND: { \
9344 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
9345 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9346 break; \
9347 } \
9348 default: { \
9349 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
9350 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9351 break; \
9352 } \
9353 } \
9354 } while (0)
9355
Victor Stinner9310abb2011-10-05 00:59:23 +02009356static PyObject *
9357pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009358 Py_ssize_t left,
9359 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009360 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009361{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009362 PyObject *u;
9363 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009364 int kind;
9365 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009366
9367 if (left < 0)
9368 left = 0;
9369 if (right < 0)
9370 right = 0;
9371
Tim Peters7a29bd52001-09-12 03:03:31 +00009372 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009373 Py_INCREF(self);
9374 return self;
9375 }
9376
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009377 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9378 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009379 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9380 return NULL;
9381 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009382 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9383 if (fill > maxchar)
9384 maxchar = fill;
9385 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009386 if (!u)
9387 return NULL;
9388
9389 kind = PyUnicode_KIND(u);
9390 data = PyUnicode_DATA(u);
9391 if (left)
9392 FILL(kind, data, fill, 0, left);
9393 if (right)
9394 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009395 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009396 assert(_PyUnicode_CheckConsistency(u, 1));
9397 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009398}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009399#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009400
Alexander Belopolsky40018472011-02-26 01:02:56 +00009401PyObject *
9402PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009403{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009404 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009405
9406 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009407 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009408 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009409
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009410 switch(PyUnicode_KIND(string)) {
9411 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009412 if (PyUnicode_IS_ASCII(string))
9413 list = asciilib_splitlines(
9414 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9415 PyUnicode_GET_LENGTH(string), keepends);
9416 else
9417 list = ucs1lib_splitlines(
9418 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9419 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009420 break;
9421 case PyUnicode_2BYTE_KIND:
9422 list = ucs2lib_splitlines(
9423 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
9424 PyUnicode_GET_LENGTH(string), keepends);
9425 break;
9426 case PyUnicode_4BYTE_KIND:
9427 list = ucs4lib_splitlines(
9428 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
9429 PyUnicode_GET_LENGTH(string), keepends);
9430 break;
9431 default:
9432 assert(0);
9433 list = 0;
9434 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009435 Py_DECREF(string);
9436 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009437}
9438
Alexander Belopolsky40018472011-02-26 01:02:56 +00009439static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009440split(PyObject *self,
9441 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009442 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009443{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009444 int kind1, kind2, kind;
9445 void *buf1, *buf2;
9446 Py_ssize_t len1, len2;
9447 PyObject* out;
9448
Guido van Rossumd57fd912000-03-10 22:53:23 +00009449 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009450 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009451
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009452 if (PyUnicode_READY(self) == -1)
9453 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009454
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009455 if (substring == NULL)
9456 switch(PyUnicode_KIND(self)) {
9457 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009458 if (PyUnicode_IS_ASCII(self))
9459 return asciilib_split_whitespace(
9460 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9461 PyUnicode_GET_LENGTH(self), maxcount
9462 );
9463 else
9464 return ucs1lib_split_whitespace(
9465 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9466 PyUnicode_GET_LENGTH(self), maxcount
9467 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009468 case PyUnicode_2BYTE_KIND:
9469 return ucs2lib_split_whitespace(
9470 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9471 PyUnicode_GET_LENGTH(self), maxcount
9472 );
9473 case PyUnicode_4BYTE_KIND:
9474 return ucs4lib_split_whitespace(
9475 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9476 PyUnicode_GET_LENGTH(self), maxcount
9477 );
9478 default:
9479 assert(0);
9480 return NULL;
9481 }
9482
9483 if (PyUnicode_READY(substring) == -1)
9484 return NULL;
9485
9486 kind1 = PyUnicode_KIND(self);
9487 kind2 = PyUnicode_KIND(substring);
9488 kind = kind1 > kind2 ? kind1 : kind2;
9489 buf1 = PyUnicode_DATA(self);
9490 buf2 = PyUnicode_DATA(substring);
9491 if (kind1 != kind)
9492 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9493 if (!buf1)
9494 return NULL;
9495 if (kind2 != kind)
9496 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9497 if (!buf2) {
9498 if (kind1 != kind) PyMem_Free(buf1);
9499 return NULL;
9500 }
9501 len1 = PyUnicode_GET_LENGTH(self);
9502 len2 = PyUnicode_GET_LENGTH(substring);
9503
9504 switch(kind) {
9505 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009506 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9507 out = asciilib_split(
9508 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9509 else
9510 out = ucs1lib_split(
9511 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009512 break;
9513 case PyUnicode_2BYTE_KIND:
9514 out = ucs2lib_split(
9515 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9516 break;
9517 case PyUnicode_4BYTE_KIND:
9518 out = ucs4lib_split(
9519 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9520 break;
9521 default:
9522 out = NULL;
9523 }
9524 if (kind1 != kind)
9525 PyMem_Free(buf1);
9526 if (kind2 != kind)
9527 PyMem_Free(buf2);
9528 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009529}
9530
Alexander Belopolsky40018472011-02-26 01:02:56 +00009531static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009532rsplit(PyObject *self,
9533 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009534 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009535{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009536 int kind1, kind2, kind;
9537 void *buf1, *buf2;
9538 Py_ssize_t len1, len2;
9539 PyObject* out;
9540
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009541 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009542 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009543
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009544 if (PyUnicode_READY(self) == -1)
9545 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009546
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009547 if (substring == NULL)
9548 switch(PyUnicode_KIND(self)) {
9549 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009550 if (PyUnicode_IS_ASCII(self))
9551 return asciilib_rsplit_whitespace(
9552 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9553 PyUnicode_GET_LENGTH(self), maxcount
9554 );
9555 else
9556 return ucs1lib_rsplit_whitespace(
9557 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9558 PyUnicode_GET_LENGTH(self), maxcount
9559 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009560 case PyUnicode_2BYTE_KIND:
9561 return ucs2lib_rsplit_whitespace(
9562 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9563 PyUnicode_GET_LENGTH(self), maxcount
9564 );
9565 case PyUnicode_4BYTE_KIND:
9566 return ucs4lib_rsplit_whitespace(
9567 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9568 PyUnicode_GET_LENGTH(self), maxcount
9569 );
9570 default:
9571 assert(0);
9572 return NULL;
9573 }
9574
9575 if (PyUnicode_READY(substring) == -1)
9576 return NULL;
9577
9578 kind1 = PyUnicode_KIND(self);
9579 kind2 = PyUnicode_KIND(substring);
9580 kind = kind1 > kind2 ? kind1 : kind2;
9581 buf1 = PyUnicode_DATA(self);
9582 buf2 = PyUnicode_DATA(substring);
9583 if (kind1 != kind)
9584 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9585 if (!buf1)
9586 return NULL;
9587 if (kind2 != kind)
9588 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9589 if (!buf2) {
9590 if (kind1 != kind) PyMem_Free(buf1);
9591 return NULL;
9592 }
9593 len1 = PyUnicode_GET_LENGTH(self);
9594 len2 = PyUnicode_GET_LENGTH(substring);
9595
9596 switch(kind) {
9597 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009598 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9599 out = asciilib_rsplit(
9600 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9601 else
9602 out = ucs1lib_rsplit(
9603 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009604 break;
9605 case PyUnicode_2BYTE_KIND:
9606 out = ucs2lib_rsplit(
9607 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9608 break;
9609 case PyUnicode_4BYTE_KIND:
9610 out = ucs4lib_rsplit(
9611 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9612 break;
9613 default:
9614 out = NULL;
9615 }
9616 if (kind1 != kind)
9617 PyMem_Free(buf1);
9618 if (kind2 != kind)
9619 PyMem_Free(buf2);
9620 return out;
9621}
9622
9623static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009624anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9625 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009626{
9627 switch(kind) {
9628 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009629 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9630 return asciilib_find(buf1, len1, buf2, len2, offset);
9631 else
9632 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009633 case PyUnicode_2BYTE_KIND:
9634 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9635 case PyUnicode_4BYTE_KIND:
9636 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9637 }
9638 assert(0);
9639 return -1;
9640}
9641
9642static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009643anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9644 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009645{
9646 switch(kind) {
9647 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009648 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9649 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9650 else
9651 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009652 case PyUnicode_2BYTE_KIND:
9653 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9654 case PyUnicode_4BYTE_KIND:
9655 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9656 }
9657 assert(0);
9658 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009659}
9660
Alexander Belopolsky40018472011-02-26 01:02:56 +00009661static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009662replace(PyObject *self, PyObject *str1,
9663 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009664{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009665 PyObject *u;
9666 char *sbuf = PyUnicode_DATA(self);
9667 char *buf1 = PyUnicode_DATA(str1);
9668 char *buf2 = PyUnicode_DATA(str2);
9669 int srelease = 0, release1 = 0, release2 = 0;
9670 int skind = PyUnicode_KIND(self);
9671 int kind1 = PyUnicode_KIND(str1);
9672 int kind2 = PyUnicode_KIND(str2);
9673 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9674 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9675 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009676
9677 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009678 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009679 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009680 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009681
Victor Stinner59de0ee2011-10-07 10:01:28 +02009682 if (str1 == str2)
9683 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009684 if (skind < kind1)
9685 /* substring too wide to be present */
9686 goto nothing;
9687
9688 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00009689 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009690 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009691 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009692 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009693 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009694 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009695 Py_UCS4 u1, u2, maxchar;
9696 int mayshrink, rkind;
9697 u1 = PyUnicode_READ_CHAR(str1, 0);
9698 if (!findchar(sbuf, PyUnicode_KIND(self),
9699 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00009700 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009701 u2 = PyUnicode_READ_CHAR(str2, 0);
9702 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9703 /* Replacing u1 with u2 may cause a maxchar reduction in the
9704 result string. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009705 if (u2 > maxchar) {
9706 maxchar = u2;
9707 mayshrink = 0;
9708 }
Victor Stinnerb9275c12011-10-05 14:01:42 +02009709 else
9710 mayshrink = maxchar > 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009711 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009712 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009713 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009714 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009715 rkind = PyUnicode_KIND(u);
9716 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9717 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009718 if (--maxcount < 0)
9719 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009720 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009721 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009722 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +02009723 unicode_adjust_maxchar(&u);
9724 if (u == NULL)
9725 goto error;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009726 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009727 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009728 int rkind = skind;
9729 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +02009730 PyObject *rstr;
9731 Py_UCS4 maxchar;
9732
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009733 if (kind1 < rkind) {
9734 /* widen substring */
9735 buf1 = _PyUnicode_AsKind(str1, rkind);
9736 if (!buf1) goto error;
9737 release1 = 1;
9738 }
Victor Stinnerc3cec782011-10-05 21:24:08 +02009739 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009740 if (i < 0)
9741 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009742 if (rkind > kind2) {
9743 /* widen replacement */
9744 buf2 = _PyUnicode_AsKind(str2, rkind);
9745 if (!buf2) goto error;
9746 release2 = 1;
9747 }
9748 else if (rkind < kind2) {
9749 /* widen self and buf1 */
9750 rkind = kind2;
9751 if (release1) PyMem_Free(buf1);
9752 sbuf = _PyUnicode_AsKind(self, rkind);
9753 if (!sbuf) goto error;
9754 srelease = 1;
9755 buf1 = _PyUnicode_AsKind(str1, rkind);
9756 if (!buf1) goto error;
9757 release1 = 1;
9758 }
Victor Stinner25a4b292011-10-06 12:31:55 +02009759 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9760 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(str2));
9761 rstr = PyUnicode_New(slen, maxchar);
9762 if (!rstr)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009763 goto error;
Victor Stinner25a4b292011-10-06 12:31:55 +02009764 res = PyUnicode_DATA(rstr);
9765
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009766 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009767 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009768 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009769 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009770 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009771 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009772
9773 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +02009774 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009775 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009776 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009777 if (i == -1)
9778 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009779 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009780 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009781 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009782 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009783 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009784
Victor Stinner25a4b292011-10-06 12:31:55 +02009785 u = rstr;
9786 unicode_adjust_maxchar(&u);
9787 if (!u)
9788 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009789 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009790 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009791
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009792 Py_ssize_t n, i, j, ires;
9793 Py_ssize_t product, new_size;
9794 int rkind = skind;
Victor Stinner25a4b292011-10-06 12:31:55 +02009795 PyObject *rstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009796 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +02009797 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009798
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009799 if (kind1 < rkind) {
9800 buf1 = _PyUnicode_AsKind(str1, rkind);
9801 if (!buf1) goto error;
9802 release1 = 1;
9803 }
Victor Stinnerc3cec782011-10-05 21:24:08 +02009804 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009805 if (n == 0)
9806 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009807 if (kind2 < rkind) {
9808 buf2 = _PyUnicode_AsKind(str2, rkind);
9809 if (!buf2) goto error;
9810 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009811 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009812 else if (kind2 > rkind) {
9813 rkind = kind2;
9814 sbuf = _PyUnicode_AsKind(self, rkind);
9815 if (!sbuf) goto error;
9816 srelease = 1;
9817 if (release1) PyMem_Free(buf1);
9818 buf1 = _PyUnicode_AsKind(str1, rkind);
9819 if (!buf1) goto error;
9820 release1 = 1;
9821 }
9822 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9823 PyUnicode_GET_LENGTH(str1))); */
9824 product = n * (len2-len1);
9825 if ((product / (len2-len1)) != n) {
9826 PyErr_SetString(PyExc_OverflowError,
9827 "replace string is too long");
9828 goto error;
9829 }
9830 new_size = slen + product;
9831 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9832 PyErr_SetString(PyExc_OverflowError,
9833 "replace string is too long");
9834 goto error;
9835 }
Victor Stinner25a4b292011-10-06 12:31:55 +02009836 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9837 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(str2));
9838 rstr = PyUnicode_New(new_size, maxchar);
9839 if (!rstr)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009840 goto error;
Victor Stinner25a4b292011-10-06 12:31:55 +02009841 res = PyUnicode_DATA(rstr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009842 ires = i = 0;
9843 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009844 while (n-- > 0) {
9845 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +02009846 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009847 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009848 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009849 if (j == -1)
9850 break;
9851 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009852 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009853 memcpy(res + rkind * ires,
9854 sbuf + rkind * i,
9855 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009856 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009857 }
9858 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009859 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009860 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009861 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009862 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009863 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009864 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009865 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009866 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009867 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009868 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009869 memcpy(res + rkind * ires,
9870 sbuf + rkind * i,
9871 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009872 } else {
9873 /* interleave */
9874 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009875 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009876 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009877 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009878 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009879 if (--n <= 0)
9880 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009881 memcpy(res + rkind * ires,
9882 sbuf + rkind * i,
9883 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009884 ires++;
9885 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009886 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009887 memcpy(res + rkind * ires,
9888 sbuf + rkind * i,
9889 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009890 }
Victor Stinner25a4b292011-10-06 12:31:55 +02009891 u = rstr;
9892 unicode_adjust_maxchar(&u);
9893 if (u == NULL)
9894 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009895 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009896 if (srelease)
9897 PyMem_FREE(sbuf);
9898 if (release1)
9899 PyMem_FREE(buf1);
9900 if (release2)
9901 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009902 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009903 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009904
Benjamin Peterson29060642009-01-31 22:14:21 +00009905 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009906 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009907 if (srelease)
9908 PyMem_FREE(sbuf);
9909 if (release1)
9910 PyMem_FREE(buf1);
9911 if (release2)
9912 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009913 if (PyUnicode_CheckExact(self)) {
9914 Py_INCREF(self);
9915 return (PyObject *) self;
9916 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009917 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009918 error:
9919 if (srelease && sbuf)
9920 PyMem_FREE(sbuf);
9921 if (release1 && buf1)
9922 PyMem_FREE(buf1);
9923 if (release2 && buf2)
9924 PyMem_FREE(buf2);
9925 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009926}
9927
9928/* --- Unicode Object Methods --------------------------------------------- */
9929
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009930PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009931 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009932\n\
9933Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009934characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009935
9936static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +02009937unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009938{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009939 return fixup(self, fixtitle);
9940}
9941
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009942PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009943 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009944\n\
9945Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009946have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009947
9948static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +02009949unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009950{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009951 return fixup(self, fixcapitalize);
9952}
9953
#if 0
/* Disabled code: kept for reference only, it is never compiled. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t pos;

    /* Break the string into a list of words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Swap every word for its capitalized form, in place */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *capitalized =
            fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                  fixcapitalize);
        if (capitalized == NULL)
            goto done;          /* result is still NULL */
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, capitalized);
    }

    /* Glue the words back together */
    result = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return result;
}
#endif
9991
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009992/* Argument converter. Coerces to a single unicode character */
9993
9994static int
9995convert_uc(PyObject *obj, void *addr)
9996{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009997 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009998 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009999
Benjamin Peterson14339b62009-01-31 16:36:08 +000010000 uniobj = PyUnicode_FromObject(obj);
10001 if (uniobj == NULL) {
10002 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010003 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010004 return 0;
10005 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010006 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010007 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010008 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010009 Py_DECREF(uniobj);
10010 return 0;
10011 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010012 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010013 Py_DECREF(uniobj);
10014 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010015}
10016
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010017PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010018 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010019\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010020Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010021done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010022
10023static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010024unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010025{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010026 Py_ssize_t marg, left;
10027 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010028 Py_UCS4 fillchar = ' ';
10029
Victor Stinnere9a29352011-10-01 02:14:59 +020010030 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010031 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010032
Victor Stinnere9a29352011-10-01 02:14:59 +020010033 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010034 return NULL;
10035
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010036 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010037 Py_INCREF(self);
10038 return (PyObject*) self;
10039 }
10040
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010041 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010042 left = marg / 2 + (marg & width & 1);
10043
Victor Stinner9310abb2011-10-05 00:59:23 +020010044 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010045}
10046
Marc-André Lemburge5034372000-08-08 08:04:29 +000010047#if 0
10048
10049/* This code should go into some future Unicode collation support
10050 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +000010051 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +000010052
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +000010053/* speedy UTF-16 code point order comparison */
10054/* gleaned from: */
10055/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
10056
Marc-André Lemburge12896e2000-07-07 17:51:08 +000010057static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +000010058{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +000010059 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +000010060 0, 0, 0, 0, 0, 0, 0, 0,
10061 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +000010062 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +000010063};
10064
Guido van Rossumd57fd912000-03-10 22:53:23 +000010065static int
10066unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
10067{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010068 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +000010069
Guido van Rossumd57fd912000-03-10 22:53:23 +000010070 Py_UNICODE *s1 = str1->str;
10071 Py_UNICODE *s2 = str2->str;
10072
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010073 len1 = str1->_base._base.length;
10074 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +000010075
Guido van Rossumd57fd912000-03-10 22:53:23 +000010076 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +000010077 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +000010078
10079 c1 = *s1++;
10080 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +000010081
Benjamin Peterson29060642009-01-31 22:14:21 +000010082 if (c1 > (1<<11) * 26)
10083 c1 += utf16Fixup[c1>>11];
10084 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +000010085 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +000010086 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +000010087
10088 if (c1 != c2)
10089 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +000010090
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +000010091 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010092 }
10093
10094 return (len1 < len2) ? -1 : (len1 != len2);
10095}
10096
Marc-André Lemburge5034372000-08-08 08:04:29 +000010097#else
10098
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010099/* This function assumes that str1 and str2 are readied by the caller. */
10100
Marc-André Lemburge5034372000-08-08 08:04:29 +000010101static int
10102unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
10103{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010104 int kind1, kind2;
10105 void *data1, *data2;
10106 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010107
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010108 kind1 = PyUnicode_KIND(str1);
10109 kind2 = PyUnicode_KIND(str2);
10110 data1 = PyUnicode_DATA(str1);
10111 data2 = PyUnicode_DATA(str2);
10112 len1 = PyUnicode_GET_LENGTH(str1);
10113 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010114
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010115 for (i = 0; i < len1 && i < len2; ++i) {
10116 Py_UCS4 c1, c2;
10117 c1 = PyUnicode_READ(kind1, data1, i);
10118 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010119
10120 if (c1 != c2)
10121 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010122 }
10123
10124 return (len1 < len2) ? -1 : (len1 != len2);
10125}
10126
10127#endif
10128
Alexander Belopolsky40018472011-02-26 01:02:56 +000010129int
10130PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010131{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010132 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10133 if (PyUnicode_READY(left) == -1 ||
10134 PyUnicode_READY(right) == -1)
10135 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010136 return unicode_compare((PyUnicodeObject *)left,
10137 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010138 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010139 PyErr_Format(PyExc_TypeError,
10140 "Can't compare %.100s and %.100s",
10141 left->ob_type->tp_name,
10142 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010143 return -1;
10144}
10145
Martin v. Löwis5b222132007-06-10 09:51:05 +000010146int
10147PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10148{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010149 Py_ssize_t i;
10150 int kind;
10151 void *data;
10152 Py_UCS4 chr;
10153
Victor Stinner910337b2011-10-03 03:20:16 +020010154 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010155 if (PyUnicode_READY(uni) == -1)
10156 return -1;
10157 kind = PyUnicode_KIND(uni);
10158 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010159 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010160 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10161 if (chr != str[i])
10162 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010163 /* This check keeps Python strings that end in '\0' from comparing equal
10164 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010165 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010166 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010167 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010168 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010169 return 0;
10170}
10171
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010172
Benjamin Peterson29060642009-01-31 22:14:21 +000010173#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010174 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010175
Alexander Belopolsky40018472011-02-26 01:02:56 +000010176PyObject *
10177PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010178{
10179 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010180
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010181 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10182 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010183 if (PyUnicode_READY(left) == -1 ||
10184 PyUnicode_READY(right) == -1)
10185 return NULL;
10186 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10187 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010188 if (op == Py_EQ) {
10189 Py_INCREF(Py_False);
10190 return Py_False;
10191 }
10192 if (op == Py_NE) {
10193 Py_INCREF(Py_True);
10194 return Py_True;
10195 }
10196 }
10197 if (left == right)
10198 result = 0;
10199 else
10200 result = unicode_compare((PyUnicodeObject *)left,
10201 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010202
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010203 /* Convert the return value to a Boolean */
10204 switch (op) {
10205 case Py_EQ:
10206 v = TEST_COND(result == 0);
10207 break;
10208 case Py_NE:
10209 v = TEST_COND(result != 0);
10210 break;
10211 case Py_LE:
10212 v = TEST_COND(result <= 0);
10213 break;
10214 case Py_GE:
10215 v = TEST_COND(result >= 0);
10216 break;
10217 case Py_LT:
10218 v = TEST_COND(result == -1);
10219 break;
10220 case Py_GT:
10221 v = TEST_COND(result == 1);
10222 break;
10223 default:
10224 PyErr_BadArgument();
10225 return NULL;
10226 }
10227 Py_INCREF(v);
10228 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010229 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010230
Brian Curtindfc80e32011-08-10 20:28:54 -050010231 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010232}
10233
Alexander Belopolsky40018472011-02-26 01:02:56 +000010234int
10235PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010236{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010237 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010238 int kind1, kind2, kind;
10239 void *buf1, *buf2;
10240 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010241 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010242
10243 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010244 sub = PyUnicode_FromObject(element);
10245 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010246 PyErr_Format(PyExc_TypeError,
10247 "'in <string>' requires string as left operand, not %s",
10248 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010249 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010250 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010251 if (PyUnicode_READY(sub) == -1)
10252 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010253
Thomas Wouters477c8d52006-05-27 19:21:47 +000010254 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010255 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010256 Py_DECREF(sub);
10257 return -1;
10258 }
10259
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010260 kind1 = PyUnicode_KIND(str);
10261 kind2 = PyUnicode_KIND(sub);
10262 kind = kind1 > kind2 ? kind1 : kind2;
10263 buf1 = PyUnicode_DATA(str);
10264 buf2 = PyUnicode_DATA(sub);
10265 if (kind1 != kind)
10266 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
10267 if (!buf1) {
10268 Py_DECREF(sub);
10269 return -1;
10270 }
10271 if (kind2 != kind)
10272 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
10273 if (!buf2) {
10274 Py_DECREF(sub);
10275 if (kind1 != kind) PyMem_Free(buf1);
10276 return -1;
10277 }
10278 len1 = PyUnicode_GET_LENGTH(str);
10279 len2 = PyUnicode_GET_LENGTH(sub);
10280
10281 switch(kind) {
10282 case PyUnicode_1BYTE_KIND:
10283 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10284 break;
10285 case PyUnicode_2BYTE_KIND:
10286 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10287 break;
10288 case PyUnicode_4BYTE_KIND:
10289 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10290 break;
10291 default:
10292 result = -1;
10293 assert(0);
10294 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010295
10296 Py_DECREF(str);
10297 Py_DECREF(sub);
10298
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010299 if (kind1 != kind)
10300 PyMem_Free(buf1);
10301 if (kind2 != kind)
10302 PyMem_Free(buf2);
10303
Guido van Rossum403d68b2000-03-13 15:55:09 +000010304 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010305}
10306
Guido van Rossumd57fd912000-03-10 22:53:23 +000010307/* Concat to string or Unicode object giving a new Unicode object. */
10308
Alexander Belopolsky40018472011-02-26 01:02:56 +000010309PyObject *
10310PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010311{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010312 PyObject *u = NULL, *v = NULL, *w;
10313 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010314
10315 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010316 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010317 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010318 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010319 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010320 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010321 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010322
10323 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010324 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010325 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010326 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010327 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010328 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010329 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010330 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010331 }
10332
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010333 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +020010334 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010335
Guido van Rossumd57fd912000-03-10 22:53:23 +000010336 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010337 w = PyUnicode_New(
10338 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10339 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010340 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010341 goto onError;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010342 copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
10343 copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010344 Py_DECREF(u);
10345 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010346 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010347 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010348
Benjamin Peterson29060642009-01-31 22:14:21 +000010349 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010350 Py_XDECREF(u);
10351 Py_XDECREF(v);
10352 return NULL;
10353}
10354
Victor Stinnerb0923652011-10-04 01:17:31 +020010355static void
10356unicode_append_inplace(PyObject **p_left, PyObject *right)
10357{
10358 Py_ssize_t left_len, right_len, new_len;
Victor Stinnerb0923652011-10-04 01:17:31 +020010359
10360 assert(PyUnicode_IS_READY(*p_left));
10361 assert(PyUnicode_IS_READY(right));
10362
10363 left_len = PyUnicode_GET_LENGTH(*p_left);
10364 right_len = PyUnicode_GET_LENGTH(right);
10365 if (left_len > PY_SSIZE_T_MAX - right_len) {
10366 PyErr_SetString(PyExc_OverflowError,
10367 "strings are too large to concat");
10368 goto error;
10369 }
10370 new_len = left_len + right_len;
10371
10372 /* Now we own the last reference to 'left', so we can resize it
10373 * in-place.
10374 */
10375 if (unicode_resize(p_left, new_len) != 0) {
10376 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10377 * deallocated so it cannot be put back into
10378 * 'variable'. The MemoryError is raised when there
10379 * is no value in 'variable', which might (very
10380 * remotely) be a cause of incompatibilities.
10381 */
10382 goto error;
10383 }
10384 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010385 copy_characters(*p_left, left_len, right, 0, right_len);
10386 _PyUnicode_DIRTY(*p_left);
Victor Stinnerb0923652011-10-04 01:17:31 +020010387 return;
10388
10389error:
10390 Py_DECREF(*p_left);
10391 *p_left = NULL;
10392}
10393
Walter Dörwald1ab83302007-05-18 17:15:44 +000010394void
Victor Stinner23e56682011-10-03 03:54:37 +020010395PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010396{
Victor Stinner23e56682011-10-03 03:54:37 +020010397 PyObject *left, *res;
10398
10399 if (p_left == NULL) {
10400 if (!PyErr_Occurred())
10401 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010402 return;
10403 }
Victor Stinner23e56682011-10-03 03:54:37 +020010404 left = *p_left;
10405 if (right == NULL || !PyUnicode_Check(left)) {
10406 if (!PyErr_Occurred())
10407 PyErr_BadInternalCall();
10408 goto error;
10409 }
10410
Victor Stinnere1335c72011-10-04 20:53:03 +020010411 if (PyUnicode_READY(left))
10412 goto error;
10413 if (PyUnicode_READY(right))
10414 goto error;
10415
Victor Stinner23e56682011-10-03 03:54:37 +020010416 if (PyUnicode_CheckExact(left) && left != unicode_empty
10417 && PyUnicode_CheckExact(right) && right != unicode_empty
10418 && unicode_resizable(left)
10419 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10420 || _PyUnicode_WSTR(left) != NULL))
10421 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010422 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10423 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010424 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010425 not so different than duplicating the string. */
10426 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010427 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010428 unicode_append_inplace(p_left, right);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010429 if (p_left != NULL)
10430 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010431 return;
10432 }
10433 }
10434
10435 res = PyUnicode_Concat(left, right);
10436 if (res == NULL)
10437 goto error;
10438 Py_DECREF(left);
10439 *p_left = res;
10440 return;
10441
10442error:
10443 Py_DECREF(*p_left);
10444 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010445}
10446
10447void
10448PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10449{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010450 PyUnicode_Append(pleft, right);
10451 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010452}
10453
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010454PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010455 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010456\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010457Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010458string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010459interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010460
10461static PyObject *
10462unicode_count(PyUnicodeObject *self, PyObject *args)
10463{
10464 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010465 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010466 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010467 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010468 int kind1, kind2, kind;
10469 void *buf1, *buf2;
10470 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010471
Jesus Ceaac451502011-04-20 17:09:23 +020010472 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10473 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010474 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010475
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010476 kind1 = PyUnicode_KIND(self);
10477 kind2 = PyUnicode_KIND(substring);
10478 kind = kind1 > kind2 ? kind1 : kind2;
10479 buf1 = PyUnicode_DATA(self);
10480 buf2 = PyUnicode_DATA(substring);
10481 if (kind1 != kind)
10482 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10483 if (!buf1) {
10484 Py_DECREF(substring);
10485 return NULL;
10486 }
10487 if (kind2 != kind)
10488 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10489 if (!buf2) {
10490 Py_DECREF(substring);
10491 if (kind1 != kind) PyMem_Free(buf1);
10492 return NULL;
10493 }
10494 len1 = PyUnicode_GET_LENGTH(self);
10495 len2 = PyUnicode_GET_LENGTH(substring);
10496
10497 ADJUST_INDICES(start, end, len1);
10498 switch(kind) {
10499 case PyUnicode_1BYTE_KIND:
10500 iresult = ucs1lib_count(
10501 ((Py_UCS1*)buf1) + start, end - start,
10502 buf2, len2, PY_SSIZE_T_MAX
10503 );
10504 break;
10505 case PyUnicode_2BYTE_KIND:
10506 iresult = ucs2lib_count(
10507 ((Py_UCS2*)buf1) + start, end - start,
10508 buf2, len2, PY_SSIZE_T_MAX
10509 );
10510 break;
10511 case PyUnicode_4BYTE_KIND:
10512 iresult = ucs4lib_count(
10513 ((Py_UCS4*)buf1) + start, end - start,
10514 buf2, len2, PY_SSIZE_T_MAX
10515 );
10516 break;
10517 default:
10518 assert(0); iresult = 0;
10519 }
10520
10521 result = PyLong_FromSsize_t(iresult);
10522
10523 if (kind1 != kind)
10524 PyMem_Free(buf1);
10525 if (kind2 != kind)
10526 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010527
10528 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010529
Guido van Rossumd57fd912000-03-10 22:53:23 +000010530 return result;
10531}
10532
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010533PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010534 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010535\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010536Encode S using the codec registered for encoding. Default encoding\n\
10537is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010538handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010539a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10540'xmlcharrefreplace' as well as any other name registered with\n\
10541codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010542
10543static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010544unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010545{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010546 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010547 char *encoding = NULL;
10548 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010549
Benjamin Peterson308d6372009-09-18 21:42:35 +000010550 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10551 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010552 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010553 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010554}
10555
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010556PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010557 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010558\n\
10559Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010560If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010561
10562static PyObject*
10563unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10564{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010565 Py_ssize_t i, j, line_pos, src_len, incr;
10566 Py_UCS4 ch;
10567 PyObject *u;
10568 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010569 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010570 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010571 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010572
10573 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010574 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010575
Antoine Pitrou22425222011-10-04 19:10:51 +020010576 if (PyUnicode_READY(self) == -1)
10577 return NULL;
10578
Thomas Wouters7e474022000-07-16 12:04:32 +000010579 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010580 src_len = PyUnicode_GET_LENGTH(self);
10581 i = j = line_pos = 0;
10582 kind = PyUnicode_KIND(self);
10583 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010584 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010585 for (; i < src_len; i++) {
10586 ch = PyUnicode_READ(kind, src_data, i);
10587 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010588 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010589 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010590 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010591 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010592 goto overflow;
10593 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010594 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010595 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010596 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010597 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010598 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010599 goto overflow;
10600 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010601 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010602 if (ch == '\n' || ch == '\r')
10603 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010604 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010605 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020010606 if (!found && PyUnicode_CheckExact(self)) {
10607 Py_INCREF((PyObject *) self);
10608 return (PyObject *) self;
10609 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010610
Guido van Rossumd57fd912000-03-10 22:53:23 +000010611 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010612 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010613 if (!u)
10614 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010615 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010616
Antoine Pitroue71d5742011-10-04 15:55:09 +020010617 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010618
Antoine Pitroue71d5742011-10-04 15:55:09 +020010619 for (; i < src_len; i++) {
10620 ch = PyUnicode_READ(kind, src_data, i);
10621 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010622 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010623 incr = tabsize - (line_pos % tabsize);
10624 line_pos += incr;
10625 while (incr--) {
10626 PyUnicode_WRITE(kind, dest_data, j, ' ');
10627 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010628 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010629 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010630 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010631 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010632 line_pos++;
10633 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010634 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010635 if (ch == '\n' || ch == '\r')
10636 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010637 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010638 }
10639 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinner17efeed2011-10-04 20:05:46 +020010640#ifndef DONT_MAKE_RESULT_READY
10641 if (_PyUnicode_READY_REPLACE(&u)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010642 Py_DECREF(u);
10643 return NULL;
10644 }
Victor Stinner17efeed2011-10-04 20:05:46 +020010645#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010646 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010647 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010648
Antoine Pitroue71d5742011-10-04 15:55:09 +020010649 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010650 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10651 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010652}
10653
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010654PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010655 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010656\n\
10657Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010658such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010659arguments start and end are interpreted as in slice notation.\n\
10660\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010661Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010662
10663static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010664unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010665{
Jesus Ceaac451502011-04-20 17:09:23 +020010666 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010667 Py_ssize_t start;
10668 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010669 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010670
Jesus Ceaac451502011-04-20 17:09:23 +020010671 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10672 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010673 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010674
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010675 if (PyUnicode_READY(self) == -1)
10676 return NULL;
10677 if (PyUnicode_READY(substring) == -1)
10678 return NULL;
10679
10680 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +020010681 asciilib_find_slice, ucs1lib_find_slice,
10682 ucs2lib_find_slice, ucs4lib_find_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010683 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010684 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010685
10686 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010687
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010688 if (result == -2)
10689 return NULL;
10690
Christian Heimes217cfd12007-12-02 14:31:20 +000010691 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010692}
10693
10694static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010695unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010696{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010697 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10698 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010699 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010700 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010701}
10702
Guido van Rossumc2504932007-09-18 19:42:40 +000010703/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010704 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010705static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000010706unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010707{
Guido van Rossumc2504932007-09-18 19:42:40 +000010708 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010709 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010710
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010711 if (_PyUnicode_HASH(self) != -1)
10712 return _PyUnicode_HASH(self);
10713 if (PyUnicode_READY(self) == -1)
10714 return -1;
10715 len = PyUnicode_GET_LENGTH(self);
10716
10717 /* The hash function as a macro, gets expanded three times below. */
10718#define HASH(P) \
10719 x = (Py_uhash_t)*P << 7; \
10720 while (--len >= 0) \
10721 x = (1000003*x) ^ (Py_uhash_t)*P++;
10722
10723 switch (PyUnicode_KIND(self)) {
10724 case PyUnicode_1BYTE_KIND: {
10725 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10726 HASH(c);
10727 break;
10728 }
10729 case PyUnicode_2BYTE_KIND: {
10730 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10731 HASH(s);
10732 break;
10733 }
10734 default: {
10735 Py_UCS4 *l;
10736 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10737 "Impossible switch case in unicode_hash");
10738 l = PyUnicode_4BYTE_DATA(self);
10739 HASH(l);
10740 break;
10741 }
10742 }
10743 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10744
Guido van Rossumc2504932007-09-18 19:42:40 +000010745 if (x == -1)
10746 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010747 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010748 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010749}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010750#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010751
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010752PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010753 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010754\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010755Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010756
10757static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010758unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010759{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010760 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020010761 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010762 Py_ssize_t start;
10763 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010764
Jesus Ceaac451502011-04-20 17:09:23 +020010765 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10766 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010767 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010768
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010769 if (PyUnicode_READY(self) == -1)
10770 return NULL;
10771 if (PyUnicode_READY(substring) == -1)
10772 return NULL;
10773
10774 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +020010775 asciilib_find_slice, ucs1lib_find_slice,
10776 ucs2lib_find_slice, ucs4lib_find_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010777 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010778 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010779
10780 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010781
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010782 if (result == -2)
10783 return NULL;
10784
Guido van Rossumd57fd912000-03-10 22:53:23 +000010785 if (result < 0) {
10786 PyErr_SetString(PyExc_ValueError, "substring not found");
10787 return NULL;
10788 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010789
Christian Heimes217cfd12007-12-02 14:31:20 +000010790 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010791}
10792
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010793PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010794 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010795\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010796Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010797at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010798
10799static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010800unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010801{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010802 Py_ssize_t i, length;
10803 int kind;
10804 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010805 int cased;
10806
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010807 if (PyUnicode_READY(self) == -1)
10808 return NULL;
10809 length = PyUnicode_GET_LENGTH(self);
10810 kind = PyUnicode_KIND(self);
10811 data = PyUnicode_DATA(self);
10812
Guido van Rossumd57fd912000-03-10 22:53:23 +000010813 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010814 if (length == 1)
10815 return PyBool_FromLong(
10816 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010817
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010818 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010819 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010820 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010821
Guido van Rossumd57fd912000-03-10 22:53:23 +000010822 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010823 for (i = 0; i < length; i++) {
10824 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010825
Benjamin Peterson29060642009-01-31 22:14:21 +000010826 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10827 return PyBool_FromLong(0);
10828 else if (!cased && Py_UNICODE_ISLOWER(ch))
10829 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010830 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010831 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010832}
10833
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010834PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010835 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010836\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010837Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010838at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010839
10840static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010841unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010842{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010843 Py_ssize_t i, length;
10844 int kind;
10845 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010846 int cased;
10847
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010848 if (PyUnicode_READY(self) == -1)
10849 return NULL;
10850 length = PyUnicode_GET_LENGTH(self);
10851 kind = PyUnicode_KIND(self);
10852 data = PyUnicode_DATA(self);
10853
Guido van Rossumd57fd912000-03-10 22:53:23 +000010854 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010855 if (length == 1)
10856 return PyBool_FromLong(
10857 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010858
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010859 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010860 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010861 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010862
Guido van Rossumd57fd912000-03-10 22:53:23 +000010863 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010864 for (i = 0; i < length; i++) {
10865 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010866
Benjamin Peterson29060642009-01-31 22:14:21 +000010867 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10868 return PyBool_FromLong(0);
10869 else if (!cased && Py_UNICODE_ISUPPER(ch))
10870 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010871 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010872 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010873}
10874
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010875PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010876 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010877\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010878Return True if S is a titlecased string and there is at least one\n\
10879character in S, i.e. upper- and titlecase characters may only\n\
10880follow uncased characters and lowercase characters only cased ones.\n\
10881Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010882
10883static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010884unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010885{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010886 Py_ssize_t i, length;
10887 int kind;
10888 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010889 int cased, previous_is_cased;
10890
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010891 if (PyUnicode_READY(self) == -1)
10892 return NULL;
10893 length = PyUnicode_GET_LENGTH(self);
10894 kind = PyUnicode_KIND(self);
10895 data = PyUnicode_DATA(self);
10896
Guido van Rossumd57fd912000-03-10 22:53:23 +000010897 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010898 if (length == 1) {
10899 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10900 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10901 (Py_UNICODE_ISUPPER(ch) != 0));
10902 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010903
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010904 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010905 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010906 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010907
Guido van Rossumd57fd912000-03-10 22:53:23 +000010908 cased = 0;
10909 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010910 for (i = 0; i < length; i++) {
10911 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010912
Benjamin Peterson29060642009-01-31 22:14:21 +000010913 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10914 if (previous_is_cased)
10915 return PyBool_FromLong(0);
10916 previous_is_cased = 1;
10917 cased = 1;
10918 }
10919 else if (Py_UNICODE_ISLOWER(ch)) {
10920 if (!previous_is_cased)
10921 return PyBool_FromLong(0);
10922 previous_is_cased = 1;
10923 cased = 1;
10924 }
10925 else
10926 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010927 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010928 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010929}
10930
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010931PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010932 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010933\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010934Return True if all characters in S are whitespace\n\
10935and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010936
10937static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010938unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010939{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010940 Py_ssize_t i, length;
10941 int kind;
10942 void *data;
10943
10944 if (PyUnicode_READY(self) == -1)
10945 return NULL;
10946 length = PyUnicode_GET_LENGTH(self);
10947 kind = PyUnicode_KIND(self);
10948 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010949
Guido van Rossumd57fd912000-03-10 22:53:23 +000010950 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010951 if (length == 1)
10952 return PyBool_FromLong(
10953 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010954
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010955 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010956 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010957 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010958
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010959 for (i = 0; i < length; i++) {
10960 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010961 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010962 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010963 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010964 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010965}
10966
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010967PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010968 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010969\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010970Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010971and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010972
10973static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010974unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010975{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010976 Py_ssize_t i, length;
10977 int kind;
10978 void *data;
10979
10980 if (PyUnicode_READY(self) == -1)
10981 return NULL;
10982 length = PyUnicode_GET_LENGTH(self);
10983 kind = PyUnicode_KIND(self);
10984 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010985
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010986 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010987 if (length == 1)
10988 return PyBool_FromLong(
10989 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010990
10991 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010992 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010993 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010994
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010995 for (i = 0; i < length; i++) {
10996 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010997 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010998 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010999 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011000}
11001
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011002PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011003 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011004\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011005Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011006and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011007
11008static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011009unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011010{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011011 int kind;
11012 void *data;
11013 Py_ssize_t len, i;
11014
11015 if (PyUnicode_READY(self) == -1)
11016 return NULL;
11017
11018 kind = PyUnicode_KIND(self);
11019 data = PyUnicode_DATA(self);
11020 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011021
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011022 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011023 if (len == 1) {
11024 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11025 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11026 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011027
11028 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011029 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011030 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011031
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011032 for (i = 0; i < len; i++) {
11033 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011034 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011035 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011036 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011037 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011038}
11039
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011040PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011041 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011042\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011043Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011044False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011045
11046static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011047unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011048{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011049 Py_ssize_t i, length;
11050 int kind;
11051 void *data;
11052
11053 if (PyUnicode_READY(self) == -1)
11054 return NULL;
11055 length = PyUnicode_GET_LENGTH(self);
11056 kind = PyUnicode_KIND(self);
11057 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011058
Guido van Rossumd57fd912000-03-10 22:53:23 +000011059 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011060 if (length == 1)
11061 return PyBool_FromLong(
11062 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011063
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011064 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011065 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011066 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011067
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011068 for (i = 0; i < length; i++) {
11069 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011070 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011071 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011072 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011073}
11074
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011075PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011076 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011077\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011078Return True if all characters in S are digits\n\
11079and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011080
11081static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011082unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011083{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011084 Py_ssize_t i, length;
11085 int kind;
11086 void *data;
11087
11088 if (PyUnicode_READY(self) == -1)
11089 return NULL;
11090 length = PyUnicode_GET_LENGTH(self);
11091 kind = PyUnicode_KIND(self);
11092 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011093
Guido van Rossumd57fd912000-03-10 22:53:23 +000011094 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011095 if (length == 1) {
11096 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11097 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11098 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011099
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011100 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011101 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011102 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011103
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011104 for (i = 0; i < length; i++) {
11105 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011106 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011107 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011108 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011109}
11110
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011111PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011112 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011113\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011114Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011115False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011116
11117static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011118unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011119{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011120 Py_ssize_t i, length;
11121 int kind;
11122 void *data;
11123
11124 if (PyUnicode_READY(self) == -1)
11125 return NULL;
11126 length = PyUnicode_GET_LENGTH(self);
11127 kind = PyUnicode_KIND(self);
11128 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011129
Guido van Rossumd57fd912000-03-10 22:53:23 +000011130 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011131 if (length == 1)
11132 return PyBool_FromLong(
11133 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011134
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011135 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011136 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011137 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011138
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011139 for (i = 0; i < length; i++) {
11140 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011141 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011142 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011143 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011144}
11145
Martin v. Löwis47383402007-08-15 07:32:56 +000011146int
11147PyUnicode_IsIdentifier(PyObject *self)
11148{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011149 int kind;
11150 void *data;
11151 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011152 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011153
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011154 if (PyUnicode_READY(self) == -1) {
11155 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011156 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011157 }
11158
11159 /* Special case for empty strings */
11160 if (PyUnicode_GET_LENGTH(self) == 0)
11161 return 0;
11162 kind = PyUnicode_KIND(self);
11163 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011164
11165 /* PEP 3131 says that the first character must be in
11166 XID_Start and subsequent characters in XID_Continue,
11167 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011168 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011169 letters, digits, underscore). However, given the current
11170 definition of XID_Start and XID_Continue, it is sufficient
11171 to check just for these, except that _ must be allowed
11172 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011173 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011174 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011175 return 0;
11176
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011177 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011178 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011179 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011180 return 1;
11181}
11182
11183PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011184 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011185\n\
11186Return True if S is a valid identifier according\n\
11187to the language definition.");
11188
11189static PyObject*
11190unicode_isidentifier(PyObject *self)
11191{
11192 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11193}
11194
Georg Brandl559e5d72008-06-11 18:37:52 +000011195PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011196 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011197\n\
11198Return True if all characters in S are considered\n\
11199printable in repr() or S is empty, False otherwise.");
11200
11201static PyObject*
11202unicode_isprintable(PyObject *self)
11203{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011204 Py_ssize_t i, length;
11205 int kind;
11206 void *data;
11207
11208 if (PyUnicode_READY(self) == -1)
11209 return NULL;
11210 length = PyUnicode_GET_LENGTH(self);
11211 kind = PyUnicode_KIND(self);
11212 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011213
11214 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011215 if (length == 1)
11216 return PyBool_FromLong(
11217 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011218
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011219 for (i = 0; i < length; i++) {
11220 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011221 Py_RETURN_FALSE;
11222 }
11223 }
11224 Py_RETURN_TRUE;
11225}
11226
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011227PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011228 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011229\n\
11230Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011231iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011232
11233static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011234unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011235{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011236 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011237}
11238
Martin v. Löwis18e16552006-02-15 17:27:45 +000011239static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000011240unicode_length(PyUnicodeObject *self)
11241{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011242 if (PyUnicode_READY(self) == -1)
11243 return -1;
11244 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011245}
11246
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011247PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011248 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011249\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011250Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011251done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011252
11253static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011254unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011255{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011256 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011257 Py_UCS4 fillchar = ' ';
11258
11259 if (PyUnicode_READY(self) == -1)
11260 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011261
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011262 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011263 return NULL;
11264
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011265 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011266 Py_INCREF(self);
11267 return (PyObject*) self;
11268 }
11269
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011270 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011271}
11272
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011273PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011274 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011275\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011276Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011277
11278static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011279unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011280{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011281 return fixup(self, fixlower);
11282}
11283
/* Selectors for the strip family: which end(s) of the string to trim. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the selectors above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for selector i: the matching format string with its
   three-character "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i] + 3)
11292
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011293/* externally visible for str.strip(unicode) */
11294PyObject *
11295_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
11296{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011297 void *data;
11298 int kind;
11299 Py_ssize_t i, j, len;
11300 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011301
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011302 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11303 return NULL;
11304
11305 kind = PyUnicode_KIND(self);
11306 data = PyUnicode_DATA(self);
11307 len = PyUnicode_GET_LENGTH(self);
11308 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11309 PyUnicode_DATA(sepobj),
11310 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011311
Benjamin Peterson14339b62009-01-31 16:36:08 +000011312 i = 0;
11313 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011314 while (i < len &&
11315 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011316 i++;
11317 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011318 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011319
Benjamin Peterson14339b62009-01-31 16:36:08 +000011320 j = len;
11321 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011322 do {
11323 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011324 } while (j >= i &&
11325 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011326 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011327 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011328
Victor Stinner12bab6d2011-10-01 01:53:49 +020011329 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011330}
11331
11332PyObject*
11333PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11334{
11335 unsigned char *data;
11336 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011337 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011338
Victor Stinnerde636f32011-10-01 03:55:54 +020011339 if (PyUnicode_READY(self) == -1)
11340 return NULL;
11341
11342 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11343
Victor Stinner12bab6d2011-10-01 01:53:49 +020011344 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011345 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011346 if (PyUnicode_CheckExact(self)) {
11347 Py_INCREF(self);
11348 return self;
11349 }
11350 else
11351 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011352 }
11353
Victor Stinner12bab6d2011-10-01 01:53:49 +020011354 length = end - start;
11355 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011356 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011357
Victor Stinnerde636f32011-10-01 03:55:54 +020011358 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011359 PyErr_SetString(PyExc_IndexError, "string index out of range");
11360 return NULL;
11361 }
11362
Victor Stinnerb9275c12011-10-05 14:01:42 +020011363 if (PyUnicode_IS_ASCII(self)) {
11364 kind = PyUnicode_KIND(self);
11365 data = PyUnicode_1BYTE_DATA(self);
11366 return unicode_fromascii(data + start, length);
11367 }
11368 else {
11369 kind = PyUnicode_KIND(self);
11370 data = PyUnicode_1BYTE_DATA(self);
11371 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011372 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011373 length);
11374 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011375}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011376
11377static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011378do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011379{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011380 int kind;
11381 void *data;
11382 Py_ssize_t len, i, j;
11383
11384 if (PyUnicode_READY(self) == -1)
11385 return NULL;
11386
11387 kind = PyUnicode_KIND(self);
11388 data = PyUnicode_DATA(self);
11389 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011390
Benjamin Peterson14339b62009-01-31 16:36:08 +000011391 i = 0;
11392 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011393 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011394 i++;
11395 }
11396 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011397
Benjamin Peterson14339b62009-01-31 16:36:08 +000011398 j = len;
11399 if (striptype != LEFTSTRIP) {
11400 do {
11401 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011402 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011403 j++;
11404 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011405
Victor Stinner12bab6d2011-10-01 01:53:49 +020011406 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011407}
11408
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011409
11410static PyObject *
11411do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
11412{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011413 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011414
Benjamin Peterson14339b62009-01-31 16:36:08 +000011415 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11416 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011417
Benjamin Peterson14339b62009-01-31 16:36:08 +000011418 if (sep != NULL && sep != Py_None) {
11419 if (PyUnicode_Check(sep))
11420 return _PyUnicode_XStrip(self, striptype, sep);
11421 else {
11422 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011423 "%s arg must be None or str",
11424 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011425 return NULL;
11426 }
11427 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011428
Benjamin Peterson14339b62009-01-31 16:36:08 +000011429 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011430}
11431
11432
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011433PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011434 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011435\n\
11436Return a copy of the string S with leading and trailing\n\
11437whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011438If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011439
11440static PyObject *
11441unicode_strip(PyUnicodeObject *self, PyObject *args)
11442{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011443 if (PyTuple_GET_SIZE(args) == 0)
11444 return do_strip(self, BOTHSTRIP); /* Common case */
11445 else
11446 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011447}
11448
11449
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011450PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011451 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011452\n\
11453Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011454If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011455
11456static PyObject *
11457unicode_lstrip(PyUnicodeObject *self, PyObject *args)
11458{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011459 if (PyTuple_GET_SIZE(args) == 0)
11460 return do_strip(self, LEFTSTRIP); /* Common case */
11461 else
11462 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011463}
11464
11465
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011466PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011467 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011468\n\
11469Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011470If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011471
11472static PyObject *
11473unicode_rstrip(PyUnicodeObject *self, PyObject *args)
11474{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011475 if (PyTuple_GET_SIZE(args) == 0)
11476 return do_strip(self, RIGHTSTRIP); /* Common case */
11477 else
11478 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011479}
11480
11481
Guido van Rossumd57fd912000-03-10 22:53:23 +000011482static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000011483unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011484{
11485 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011486 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011487
Georg Brandl222de0f2009-04-12 12:01:50 +000011488 if (len < 1) {
11489 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011490 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011491 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011492
Tim Peters7a29bd52001-09-12 03:03:31 +000011493 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011494 /* no repeat, return original string */
11495 Py_INCREF(str);
11496 return (PyObject*) str;
11497 }
Tim Peters8f422462000-09-09 06:13:41 +000011498
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011499 if (PyUnicode_READY(str) == -1)
11500 return NULL;
11501
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011502 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011503 PyErr_SetString(PyExc_OverflowError,
11504 "repeated string is too long");
11505 return NULL;
11506 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011507 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011508
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011509 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011510 if (!u)
11511 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011512 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011513
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011514 if (PyUnicode_GET_LENGTH(str) == 1) {
11515 const int kind = PyUnicode_KIND(str);
11516 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11517 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011518 if (kind == PyUnicode_1BYTE_KIND)
11519 memset(to, (unsigned char)fill_char, len);
11520 else {
11521 for (n = 0; n < len; ++n)
11522 PyUnicode_WRITE(kind, to, n, fill_char);
11523 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011524 }
11525 else {
11526 /* number of characters copied this far */
11527 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011528 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011529 char *to = (char *) PyUnicode_DATA(u);
11530 Py_MEMCPY(to, PyUnicode_DATA(str),
11531 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011532 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011533 n = (done <= nchars-done) ? done : nchars-done;
11534 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011535 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011536 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011537 }
11538
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011539 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011540 return (PyObject*) u;
11541}
11542
Alexander Belopolsky40018472011-02-26 01:02:56 +000011543PyObject *
11544PyUnicode_Replace(PyObject *obj,
11545 PyObject *subobj,
11546 PyObject *replobj,
11547 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011548{
11549 PyObject *self;
11550 PyObject *str1;
11551 PyObject *str2;
11552 PyObject *result;
11553
11554 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011555 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011556 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011557 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011558 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011559 Py_DECREF(self);
11560 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011561 }
11562 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011563 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011564 Py_DECREF(self);
11565 Py_DECREF(str1);
11566 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011567 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011568 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011569 Py_DECREF(self);
11570 Py_DECREF(str1);
11571 Py_DECREF(str2);
11572 return result;
11573}
11574
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011575PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011576 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011577\n\
11578Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011579old replaced by new. If the optional argument count is\n\
11580given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011581
11582static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011583unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011584{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011585 PyObject *str1;
11586 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011587 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011588 PyObject *result;
11589
Martin v. Löwis18e16552006-02-15 17:27:45 +000011590 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011591 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011592 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011593 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011594 str1 = PyUnicode_FromObject(str1);
11595 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11596 return NULL;
11597 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011598 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011599 Py_DECREF(str1);
11600 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011601 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011602
11603 result = replace(self, str1, str2, maxcount);
11604
11605 Py_DECREF(str1);
11606 Py_DECREF(str2);
11607 return result;
11608}
11609
Alexander Belopolsky40018472011-02-26 01:02:56 +000011610static PyObject *
11611unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011612{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011613 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011614 Py_ssize_t isize;
11615 Py_ssize_t osize, squote, dquote, i, o;
11616 Py_UCS4 max, quote;
11617 int ikind, okind;
11618 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011619
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011620 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011621 return NULL;
11622
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011623 isize = PyUnicode_GET_LENGTH(unicode);
11624 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011625
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011626 /* Compute length of output, quote characters, and
11627 maximum character */
11628 osize = 2; /* quotes */
11629 max = 127;
11630 squote = dquote = 0;
11631 ikind = PyUnicode_KIND(unicode);
11632 for (i = 0; i < isize; i++) {
11633 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11634 switch (ch) {
11635 case '\'': squote++; osize++; break;
11636 case '"': dquote++; osize++; break;
11637 case '\\': case '\t': case '\r': case '\n':
11638 osize += 2; break;
11639 default:
11640 /* Fast-path ASCII */
11641 if (ch < ' ' || ch == 0x7f)
11642 osize += 4; /* \xHH */
11643 else if (ch < 0x7f)
11644 osize++;
11645 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11646 osize++;
11647 max = ch > max ? ch : max;
11648 }
11649 else if (ch < 0x100)
11650 osize += 4; /* \xHH */
11651 else if (ch < 0x10000)
11652 osize += 6; /* \uHHHH */
11653 else
11654 osize += 10; /* \uHHHHHHHH */
11655 }
11656 }
11657
11658 quote = '\'';
11659 if (squote) {
11660 if (dquote)
11661 /* Both squote and dquote present. Use squote,
11662 and escape them */
11663 osize += squote;
11664 else
11665 quote = '"';
11666 }
11667
11668 repr = PyUnicode_New(osize, max);
11669 if (repr == NULL)
11670 return NULL;
11671 okind = PyUnicode_KIND(repr);
11672 odata = PyUnicode_DATA(repr);
11673
11674 PyUnicode_WRITE(okind, odata, 0, quote);
11675 PyUnicode_WRITE(okind, odata, osize-1, quote);
11676
11677 for (i = 0, o = 1; i < isize; i++) {
11678 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011679
11680 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011681 if ((ch == quote) || (ch == '\\')) {
11682 PyUnicode_WRITE(okind, odata, o++, '\\');
11683 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011684 continue;
11685 }
11686
Benjamin Peterson29060642009-01-31 22:14:21 +000011687 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011688 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011689 PyUnicode_WRITE(okind, odata, o++, '\\');
11690 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011691 }
11692 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011693 PyUnicode_WRITE(okind, odata, o++, '\\');
11694 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011695 }
11696 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011697 PyUnicode_WRITE(okind, odata, o++, '\\');
11698 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011699 }
11700
11701 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011702 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011703 PyUnicode_WRITE(okind, odata, o++, '\\');
11704 PyUnicode_WRITE(okind, odata, o++, 'x');
11705 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11706 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011707 }
11708
Georg Brandl559e5d72008-06-11 18:37:52 +000011709 /* Copy ASCII characters as-is */
11710 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011711 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011712 }
11713
Benjamin Peterson29060642009-01-31 22:14:21 +000011714 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011715 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011716 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011717 (categories Z* and C* except ASCII space)
11718 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011719 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011720 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011721 if (ch <= 0xff) {
11722 PyUnicode_WRITE(okind, odata, o++, '\\');
11723 PyUnicode_WRITE(okind, odata, o++, 'x');
11724 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11725 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011726 }
11727 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011728 else if (ch >= 0x10000) {
11729 PyUnicode_WRITE(okind, odata, o++, '\\');
11730 PyUnicode_WRITE(okind, odata, o++, 'U');
11731 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
11732 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
11733 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
11734 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
11735 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11736 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11737 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11738 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011739 }
11740 /* Map 16-bit characters to '\uxxxx' */
11741 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011742 PyUnicode_WRITE(okind, odata, o++, '\\');
11743 PyUnicode_WRITE(okind, odata, o++, 'u');
11744 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11745 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11746 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11747 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011748 }
11749 }
11750 /* Copy characters as-is */
11751 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011752 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011753 }
11754 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011755 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011756 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020011757 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000011758 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011759}
11760
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011761PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011762 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011763\n\
11764Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011765such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011766arguments start and end are interpreted as in slice notation.\n\
11767\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011768Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011769
11770static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011771unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011772{
Jesus Ceaac451502011-04-20 17:09:23 +020011773 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011774 Py_ssize_t start;
11775 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011776 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011777
Jesus Ceaac451502011-04-20 17:09:23 +020011778 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11779 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011780 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011781
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011782 if (PyUnicode_READY(self) == -1)
11783 return NULL;
11784 if (PyUnicode_READY(substring) == -1)
11785 return NULL;
11786
11787 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +020011788 asciilib_rfind_slice, ucs1lib_rfind_slice,
11789 ucs2lib_rfind_slice, ucs4lib_rfind_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011790 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011791 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011792
11793 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011794
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011795 if (result == -2)
11796 return NULL;
11797
Christian Heimes217cfd12007-12-02 14:31:20 +000011798 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011799}
11800
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011801PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011802 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011803\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011804Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011805
11806static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011807unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011808{
Jesus Ceaac451502011-04-20 17:09:23 +020011809 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011810 Py_ssize_t start;
11811 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011812 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011813
Jesus Ceaac451502011-04-20 17:09:23 +020011814 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11815 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011816 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011817
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011818 if (PyUnicode_READY(self) == -1)
11819 return NULL;
11820 if (PyUnicode_READY(substring) == -1)
11821 return NULL;
11822
11823 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +020011824 asciilib_rfind_slice, ucs1lib_rfind_slice,
11825 ucs2lib_rfind_slice, ucs4lib_rfind_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011826 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011827 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011828
11829 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011830
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011831 if (result == -2)
11832 return NULL;
11833
Guido van Rossumd57fd912000-03-10 22:53:23 +000011834 if (result < 0) {
11835 PyErr_SetString(PyExc_ValueError, "substring not found");
11836 return NULL;
11837 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011838
Christian Heimes217cfd12007-12-02 14:31:20 +000011839 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011840}
11841
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011842PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011843 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011844\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011845Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011846done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011847
11848static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011849unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011850{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011851 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011852 Py_UCS4 fillchar = ' ';
11853
Victor Stinnere9a29352011-10-01 02:14:59 +020011854 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011855 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011856
Victor Stinnere9a29352011-10-01 02:14:59 +020011857 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011858 return NULL;
11859
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011860 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011861 Py_INCREF(self);
11862 return (PyObject*) self;
11863 }
11864
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011865 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011866}
11867
Alexander Belopolsky40018472011-02-26 01:02:56 +000011868PyObject *
11869PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011870{
11871 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011872
Guido van Rossumd57fd912000-03-10 22:53:23 +000011873 s = PyUnicode_FromObject(s);
11874 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011875 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011876 if (sep != NULL) {
11877 sep = PyUnicode_FromObject(sep);
11878 if (sep == NULL) {
11879 Py_DECREF(s);
11880 return NULL;
11881 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011882 }
11883
Victor Stinner9310abb2011-10-05 00:59:23 +020011884 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011885
11886 Py_DECREF(s);
11887 Py_XDECREF(sep);
11888 return result;
11889}
11890
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011891PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011892 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011893\n\
11894Return a list of the words in S, using sep as the\n\
11895delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011896splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011897whitespace string is a separator and empty strings are\n\
11898removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011899
11900static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011901unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011902{
11903 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011904 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011905
Martin v. Löwis18e16552006-02-15 17:27:45 +000011906 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011907 return NULL;
11908
11909 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011910 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011911 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020011912 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011913 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011914 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011915}
11916
Thomas Wouters477c8d52006-05-27 19:21:47 +000011917PyObject *
11918PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11919{
11920 PyObject* str_obj;
11921 PyObject* sep_obj;
11922 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011923 int kind1, kind2, kind;
11924 void *buf1 = NULL, *buf2 = NULL;
11925 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011926
11927 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011928 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011929 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011930 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011931 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011932 Py_DECREF(str_obj);
11933 return NULL;
11934 }
11935
Victor Stinner14f8f022011-10-05 20:58:25 +020011936 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011937 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020011938 kind = Py_MAX(kind1, kind2);
11939 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011940 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020011941 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011942 if (!buf1)
11943 goto onError;
11944 buf2 = PyUnicode_DATA(sep_obj);
11945 if (kind2 != kind)
11946 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11947 if (!buf2)
11948 goto onError;
11949 len1 = PyUnicode_GET_LENGTH(str_obj);
11950 len2 = PyUnicode_GET_LENGTH(sep_obj);
11951
Victor Stinner14f8f022011-10-05 20:58:25 +020011952 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011953 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020011954 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
11955 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11956 else
11957 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011958 break;
11959 case PyUnicode_2BYTE_KIND:
11960 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11961 break;
11962 case PyUnicode_4BYTE_KIND:
11963 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11964 break;
11965 default:
11966 assert(0);
11967 out = 0;
11968 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011969
11970 Py_DECREF(sep_obj);
11971 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011972 if (kind1 != kind)
11973 PyMem_Free(buf1);
11974 if (kind2 != kind)
11975 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011976
11977 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011978 onError:
11979 Py_DECREF(sep_obj);
11980 Py_DECREF(str_obj);
11981 if (kind1 != kind && buf1)
11982 PyMem_Free(buf1);
11983 if (kind2 != kind && buf2)
11984 PyMem_Free(buf2);
11985 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011986}
11987
11988
11989PyObject *
11990PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11991{
11992 PyObject* str_obj;
11993 PyObject* sep_obj;
11994 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011995 int kind1, kind2, kind;
11996 void *buf1 = NULL, *buf2 = NULL;
11997 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011998
11999 str_obj = PyUnicode_FromObject(str_in);
12000 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012001 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012002 sep_obj = PyUnicode_FromObject(sep_in);
12003 if (!sep_obj) {
12004 Py_DECREF(str_obj);
12005 return NULL;
12006 }
12007
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012008 kind1 = PyUnicode_KIND(str_in);
12009 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012010 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012011 buf1 = PyUnicode_DATA(str_in);
12012 if (kind1 != kind)
12013 buf1 = _PyUnicode_AsKind(str_in, kind);
12014 if (!buf1)
12015 goto onError;
12016 buf2 = PyUnicode_DATA(sep_obj);
12017 if (kind2 != kind)
12018 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12019 if (!buf2)
12020 goto onError;
12021 len1 = PyUnicode_GET_LENGTH(str_obj);
12022 len2 = PyUnicode_GET_LENGTH(sep_obj);
12023
12024 switch(PyUnicode_KIND(str_in)) {
12025 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012026 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12027 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12028 else
12029 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012030 break;
12031 case PyUnicode_2BYTE_KIND:
12032 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12033 break;
12034 case PyUnicode_4BYTE_KIND:
12035 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12036 break;
12037 default:
12038 assert(0);
12039 out = 0;
12040 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012041
12042 Py_DECREF(sep_obj);
12043 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012044 if (kind1 != kind)
12045 PyMem_Free(buf1);
12046 if (kind2 != kind)
12047 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012048
12049 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012050 onError:
12051 Py_DECREF(sep_obj);
12052 Py_DECREF(str_obj);
12053 if (kind1 != kind && buf1)
12054 PyMem_Free(buf1);
12055 if (kind2 != kind && buf2)
12056 PyMem_Free(buf2);
12057 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012058}
12059
12060PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012061 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012062\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012063Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012064the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012065found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012066
12067static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012068unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012069{
Victor Stinner9310abb2011-10-05 00:59:23 +020012070 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012071}
12072
12073PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012074 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012075\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012076Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012077the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012078separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012079
12080static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012081unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012082{
Victor Stinner9310abb2011-10-05 00:59:23 +020012083 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012084}
12085
Alexander Belopolsky40018472011-02-26 01:02:56 +000012086PyObject *
12087PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012088{
12089 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012090
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012091 s = PyUnicode_FromObject(s);
12092 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012093 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012094 if (sep != NULL) {
12095 sep = PyUnicode_FromObject(sep);
12096 if (sep == NULL) {
12097 Py_DECREF(s);
12098 return NULL;
12099 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012100 }
12101
Victor Stinner9310abb2011-10-05 00:59:23 +020012102 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012103
12104 Py_DECREF(s);
12105 Py_XDECREF(sep);
12106 return result;
12107}
12108
12109PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012110 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012111\n\
12112Return a list of the words in S, using sep as the\n\
12113delimiter string, starting at the end of the string and\n\
12114working to the front. If maxsplit is given, at most maxsplit\n\
12115splits are done. If sep is not specified, any whitespace string\n\
12116is a separator.");
12117
12118static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012119unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012120{
12121 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012122 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012123
Martin v. Löwis18e16552006-02-15 17:27:45 +000012124 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012125 return NULL;
12126
12127 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012128 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012129 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012130 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012131 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012132 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012133}
12134
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012135PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012136 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012137\n\
12138Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012139Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012140is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012141
12142static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012143unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012144{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012145 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012146 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012147
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012148 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12149 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012150 return NULL;
12151
Guido van Rossum86662912000-04-11 15:38:46 +000012152 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012153}
12154
12155static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012156PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012157{
Walter Dörwald346737f2007-05-31 10:44:43 +000012158 if (PyUnicode_CheckExact(self)) {
12159 Py_INCREF(self);
12160 return self;
12161 } else
12162 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020012163 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012164}
12165
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012166PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012167 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012168\n\
12169Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012170and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012171
12172static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012173unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012174{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012175 return fixup(self, fixswapcase);
12176}
12177
Georg Brandlceee0772007-11-27 23:48:05 +000012178PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012179 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012180\n\
12181Return a translation table usable for str.translate().\n\
12182If there is only one argument, it must be a dictionary mapping Unicode\n\
12183ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012184Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012185If there are two arguments, they must be strings of equal length, and\n\
12186in the resulting dictionary, each character in x will be mapped to the\n\
12187character at the same position in y. If there is a third argument, it\n\
12188must be a string, whose characters will be mapped to None in the result.");
12189
12190static PyObject*
12191unicode_maketrans(PyUnicodeObject *null, PyObject *args)
12192{
12193 PyObject *x, *y = NULL, *z = NULL;
12194 PyObject *new = NULL, *key, *value;
12195 Py_ssize_t i = 0;
12196 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012197
Georg Brandlceee0772007-11-27 23:48:05 +000012198 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12199 return NULL;
12200 new = PyDict_New();
12201 if (!new)
12202 return NULL;
12203 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012204 int x_kind, y_kind, z_kind;
12205 void *x_data, *y_data, *z_data;
12206
Georg Brandlceee0772007-11-27 23:48:05 +000012207 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012208 if (!PyUnicode_Check(x)) {
12209 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12210 "be a string if there is a second argument");
12211 goto err;
12212 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012213 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012214 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12215 "arguments must have equal length");
12216 goto err;
12217 }
12218 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012219 x_kind = PyUnicode_KIND(x);
12220 y_kind = PyUnicode_KIND(y);
12221 x_data = PyUnicode_DATA(x);
12222 y_data = PyUnicode_DATA(y);
12223 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12224 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12225 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012226 if (!key || !value)
12227 goto err;
12228 res = PyDict_SetItem(new, key, value);
12229 Py_DECREF(key);
12230 Py_DECREF(value);
12231 if (res < 0)
12232 goto err;
12233 }
12234 /* create entries for deleting chars in z */
12235 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012236 z_kind = PyUnicode_KIND(z);
12237 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000012238 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012239 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012240 if (!key)
12241 goto err;
12242 res = PyDict_SetItem(new, key, Py_None);
12243 Py_DECREF(key);
12244 if (res < 0)
12245 goto err;
12246 }
12247 }
12248 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012249 int kind;
12250 void *data;
12251
Georg Brandlceee0772007-11-27 23:48:05 +000012252 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012253 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012254 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12255 "to maketrans it must be a dict");
12256 goto err;
12257 }
12258 /* copy entries into the new dict, converting string keys to int keys */
12259 while (PyDict_Next(x, &i, &key, &value)) {
12260 if (PyUnicode_Check(key)) {
12261 /* convert string keys to integer keys */
12262 PyObject *newkey;
12263 if (PyUnicode_GET_SIZE(key) != 1) {
12264 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12265 "table must be of length 1");
12266 goto err;
12267 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012268 kind = PyUnicode_KIND(key);
12269 data = PyUnicode_DATA(key);
12270 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012271 if (!newkey)
12272 goto err;
12273 res = PyDict_SetItem(new, newkey, value);
12274 Py_DECREF(newkey);
12275 if (res < 0)
12276 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012277 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012278 /* just keep integer keys */
12279 if (PyDict_SetItem(new, key, value) < 0)
12280 goto err;
12281 } else {
12282 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12283 "be strings or integers");
12284 goto err;
12285 }
12286 }
12287 }
12288 return new;
12289 err:
12290 Py_DECREF(new);
12291 return NULL;
12292}
12293
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012294PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012295 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012296\n\
12297Return a copy of the string S, where all characters have been mapped\n\
12298through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012299Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012300Unmapped characters are left untouched. Characters mapped to None\n\
12301are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012302
12303static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012304unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012305{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012306 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012307}
12308
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012309PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012310 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012311\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012312Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012313
12314static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012315unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012316{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012317 return fixup(self, fixupper);
12318}
12319
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012320PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012321 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012322\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012323Pad a numeric string S with zeros on the left, to fill a field\n\
12324of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012325
12326static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012327unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012328{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012329 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012330 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012331 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012332 int kind;
12333 void *data;
12334 Py_UCS4 chr;
12335
12336 if (PyUnicode_READY(self) == -1)
12337 return NULL;
12338
Martin v. Löwis18e16552006-02-15 17:27:45 +000012339 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012340 return NULL;
12341
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012342 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012343 if (PyUnicode_CheckExact(self)) {
12344 Py_INCREF(self);
12345 return (PyObject*) self;
12346 }
12347 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020012348 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012349 }
12350
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012351 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012352
12353 u = pad(self, fill, 0, '0');
12354
Walter Dörwald068325e2002-04-15 13:36:47 +000012355 if (u == NULL)
12356 return NULL;
12357
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012358 kind = PyUnicode_KIND(u);
12359 data = PyUnicode_DATA(u);
12360 chr = PyUnicode_READ(kind, data, fill);
12361
12362 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012363 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012364 PyUnicode_WRITE(kind, data, 0, chr);
12365 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012366 }
12367
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012368 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012369 return (PyObject*) u;
12370}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012371
#if 0
/* Compiled out by the enclosing "#if 0".  When enabled, simply returns the
   result of PyUnicode_TransformDecimalAndSpaceToASCII(self). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12379
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012380PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012381 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012382\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012383Return True if S starts with the specified prefix, False otherwise.\n\
12384With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012385With optional end, stop comparing S at that position.\n\
12386prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012387
12388static PyObject *
12389unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012390 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012391{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012392 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012393 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012394 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012395 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012396 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012397
Jesus Ceaac451502011-04-20 17:09:23 +020012398 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012399 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012400 if (PyTuple_Check(subobj)) {
12401 Py_ssize_t i;
12402 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12403 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012404 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012405 if (substring == NULL)
12406 return NULL;
12407 result = tailmatch(self, substring, start, end, -1);
12408 Py_DECREF(substring);
12409 if (result) {
12410 Py_RETURN_TRUE;
12411 }
12412 }
12413 /* nothing matched */
12414 Py_RETURN_FALSE;
12415 }
12416 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012417 if (substring == NULL) {
12418 if (PyErr_ExceptionMatches(PyExc_TypeError))
12419 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12420 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012421 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012422 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012423 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012424 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012425 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012426}
12427
12428
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012429PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012430 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012431\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012432Return True if S ends with the specified suffix, False otherwise.\n\
12433With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012434With optional end, stop comparing S at that position.\n\
12435suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012436
12437static PyObject *
12438unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012439 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012440{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012441 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012442 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012443 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012444 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012445 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012446
Jesus Ceaac451502011-04-20 17:09:23 +020012447 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012448 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012449 if (PyTuple_Check(subobj)) {
12450 Py_ssize_t i;
12451 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12452 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012453 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012454 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012455 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012456 result = tailmatch(self, substring, start, end, +1);
12457 Py_DECREF(substring);
12458 if (result) {
12459 Py_RETURN_TRUE;
12460 }
12461 }
12462 Py_RETURN_FALSE;
12463 }
12464 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012465 if (substring == NULL) {
12466 if (PyErr_ExceptionMatches(PyExc_TypeError))
12467 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12468 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012469 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012470 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012471 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012472 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012473 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012474}
12475
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012476#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012477
12478PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012479 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012480\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012481Return a formatted version of S, using substitutions from args and kwargs.\n\
12482The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012483
Eric Smith27bbca62010-11-04 17:06:58 +000012484PyDoc_STRVAR(format_map__doc__,
12485 "S.format_map(mapping) -> str\n\
12486\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012487Return a formatted version of S, using substitutions from mapping.\n\
12488The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012489
Eric Smith4a7d76d2008-05-30 18:10:19 +000012490static PyObject *
12491unicode__format__(PyObject* self, PyObject* args)
12492{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012493 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012494
12495 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12496 return NULL;
12497
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012498 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012499 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012500 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012501}
12502
Eric Smith8c663262007-08-25 02:26:07 +000012503PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012504 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012505\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012506Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012507
12508static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012509unicode__sizeof__(PyUnicodeObject *v)
12510{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012511 Py_ssize_t size;
12512
12513 /* If it's a compact object, account for base structure +
12514 character data. */
12515 if (PyUnicode_IS_COMPACT_ASCII(v))
12516 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12517 else if (PyUnicode_IS_COMPACT(v))
12518 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012519 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012520 else {
12521 /* If it is a two-block object, account for base object, and
12522 for character block if present. */
12523 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012524 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012525 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012526 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012527 }
12528 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012529 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012530 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012531 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012532 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012533 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012534
12535 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012536}
12537
12538PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012539 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012540
12541static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012542unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012543{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012544 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012545 if (!copy)
12546 return NULL;
12547 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012548}
12549
Guido van Rossumd57fd912000-03-10 22:53:23 +000012550static PyMethodDef unicode_methods[] = {
12551
12552 /* Order is according to common usage: often used methods should
12553 appear first, since lookup is done sequentially. */
12554
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012555 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012556 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12557 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012558 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012559 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12560 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12561 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12562 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12563 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12564 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12565 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012566 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012567 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12568 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12569 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012570 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012571 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12572 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12573 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012574 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012575 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012576 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012577 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012578 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12579 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12580 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12581 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12582 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12583 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12584 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12585 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12586 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12587 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12588 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12589 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12590 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12591 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012592 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012593 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012594 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012595 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012596 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012597 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012598 {"maketrans", (PyCFunction) unicode_maketrans,
12599 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012600 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012601#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012602 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012603#endif
12604
12605#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012606 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012607 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012608#endif
12609
Benjamin Peterson14339b62009-01-31 16:36:08 +000012610 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012611 {NULL, NULL}
12612};
12613
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012614static PyObject *
12615unicode_mod(PyObject *v, PyObject *w)
12616{
Brian Curtindfc80e32011-08-10 20:28:54 -050012617 if (!PyUnicode_Check(v))
12618 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012619 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012620}
12621
12622static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012623 0, /*nb_add*/
12624 0, /*nb_subtract*/
12625 0, /*nb_multiply*/
12626 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012627};
12628
Guido van Rossumd57fd912000-03-10 22:53:23 +000012629static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012630 (lenfunc) unicode_length, /* sq_length */
12631 PyUnicode_Concat, /* sq_concat */
12632 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12633 (ssizeargfunc) unicode_getitem, /* sq_item */
12634 0, /* sq_slice */
12635 0, /* sq_ass_item */
12636 0, /* sq_ass_slice */
12637 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012638};
12639
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012640static PyObject*
12641unicode_subscript(PyUnicodeObject* self, PyObject* item)
12642{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012643 if (PyUnicode_READY(self) == -1)
12644 return NULL;
12645
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012646 if (PyIndex_Check(item)) {
12647 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012648 if (i == -1 && PyErr_Occurred())
12649 return NULL;
12650 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012651 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012652 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012653 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012654 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012655 PyObject *result;
12656 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012657 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012658 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012659
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012660 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012661 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012662 return NULL;
12663 }
12664
12665 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012666 return PyUnicode_New(0, 0);
12667 } else if (start == 0 && step == 1 &&
12668 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012669 PyUnicode_CheckExact(self)) {
12670 Py_INCREF(self);
12671 return (PyObject *)self;
12672 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012673 return PyUnicode_Substring((PyObject*)self,
12674 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012675 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012676 /* General case */
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012677 max_char = 0;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012678 src_kind = PyUnicode_KIND(self);
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012679 kind_limit = kind_maxchar_limit(src_kind);
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012680 src_data = PyUnicode_DATA(self);
12681 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12682 ch = PyUnicode_READ(src_kind, src_data, cur);
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012683 if (ch > max_char) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012684 max_char = ch;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012685 if (max_char >= kind_limit)
12686 break;
12687 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012688 }
12689 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012690 if (result == NULL)
12691 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012692 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012693 dest_data = PyUnicode_DATA(result);
12694
12695 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012696 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
12697 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012698 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012699 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012700 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012701 } else {
12702 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12703 return NULL;
12704 }
12705}
12706
12707static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012708 (lenfunc)unicode_length, /* mp_length */
12709 (binaryfunc)unicode_subscript, /* mp_subscript */
12710 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012711};
12712
Guido van Rossumd57fd912000-03-10 22:53:23 +000012713
Guido van Rossumd57fd912000-03-10 22:53:23 +000012714/* Helpers for PyUnicode_Format() */
12715
12716static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012717getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012718{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012719 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012720 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012721 (*p_argidx)++;
12722 if (arglen < 0)
12723 return args;
12724 else
12725 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012726 }
12727 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012728 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012729 return NULL;
12730}
12731
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012732/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012733
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012734static PyObject *
12735formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012736{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012737 char *p;
12738 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012739 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012740
Guido van Rossumd57fd912000-03-10 22:53:23 +000012741 x = PyFloat_AsDouble(v);
12742 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012743 return NULL;
12744
Guido van Rossumd57fd912000-03-10 22:53:23 +000012745 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012746 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012747
Eric Smith0923d1d2009-04-16 20:16:10 +000012748 p = PyOS_double_to_string(x, type, prec,
12749 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012750 if (p == NULL)
12751 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012752 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012753 PyMem_Free(p);
12754 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012755}
12756
Tim Peters38fd5b62000-09-21 05:43:11 +000012757static PyObject*
12758formatlong(PyObject *val, int flags, int prec, int type)
12759{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012760 char *buf;
12761 int len;
12762 PyObject *str; /* temporary string object. */
12763 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012764
Benjamin Peterson14339b62009-01-31 16:36:08 +000012765 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12766 if (!str)
12767 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012768 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012769 Py_DECREF(str);
12770 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012771}
12772
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012773static Py_UCS4
12774formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012775{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012776 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012777 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012778 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012779 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012780 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012781 goto onError;
12782 }
12783 else {
12784 /* Integer input truncated to a character */
12785 long x;
12786 x = PyLong_AsLong(v);
12787 if (x == -1 && PyErr_Occurred())
12788 goto onError;
12789
12790 if (x < 0 || x > 0x10ffff) {
12791 PyErr_SetString(PyExc_OverflowError,
12792 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012793 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012794 }
12795
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012796 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012797 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012798
Benjamin Peterson29060642009-01-31 22:14:21 +000012799 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012800 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012801 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012802 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012803}
12804
Antoine Pitrou978b9d22011-10-07 12:35:48 +020012805static int
12806repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
12807{
12808 int r;
12809 assert(count > 0);
12810 assert(PyUnicode_Check(obj));
12811 if (count > 5) {
12812 PyObject *repeated = unicode_repeat((PyUnicodeObject *) obj, count);
12813 if (repeated == NULL)
12814 return -1;
12815 r = _PyAccu_Accumulate(acc, repeated);
12816 Py_DECREF(repeated);
12817 return r;
12818 }
12819 else {
12820 do {
12821 if (_PyAccu_Accumulate(acc, obj))
12822 return -1;
12823 } while (--count);
12824 return 0;
12825 }
12826}
12827
Alexander Belopolsky40018472011-02-26 01:02:56 +000012828PyObject *
12829PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012830{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012831 void *fmt;
12832 int fmtkind;
12833 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012834 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012835 int r;
12836 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012837 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012838 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012839 PyObject *temp = NULL;
12840 PyObject *second = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012841 PyUnicodeObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012842 _PyAccu acc;
12843 static PyObject *plus, *minus, *blank, *zero, *percent;
12844
12845 if (!plus && !(plus = get_latin1_char('+')))
12846 return NULL;
12847 if (!minus && !(minus = get_latin1_char('-')))
12848 return NULL;
12849 if (!blank && !(blank = get_latin1_char(' ')))
12850 return NULL;
12851 if (!zero && !(zero = get_latin1_char('0')))
12852 return NULL;
12853 if (!percent && !(percent = get_latin1_char('%')))
12854 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000012855
Guido van Rossumd57fd912000-03-10 22:53:23 +000012856 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012857 PyErr_BadInternalCall();
12858 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012859 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012860 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12861 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012862 return NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012863 if (_PyAccu_Init(&acc))
12864 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012865 fmt = PyUnicode_DATA(uformat);
12866 fmtkind = PyUnicode_KIND(uformat);
12867 fmtcnt = PyUnicode_GET_LENGTH(uformat);
12868 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012869
Guido van Rossumd57fd912000-03-10 22:53:23 +000012870 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012871 arglen = PyTuple_Size(args);
12872 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012873 }
12874 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012875 arglen = -1;
12876 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012877 }
Christian Heimes90aa7642007-12-19 02:45:37 +000012878 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000012879 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000012880 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012881
12882 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012883 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012884 PyObject *nonfmt;
12885 Py_ssize_t nonfmtpos;
12886 nonfmtpos = fmtpos++;
12887 while (fmtcnt >= 0 &&
12888 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
12889 fmtpos++;
12890 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012891 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012892 nonfmt = PyUnicode_Substring((PyObject *) uformat, nonfmtpos, fmtpos);
12893 if (nonfmt == NULL)
12894 goto onError;
12895 r = _PyAccu_Accumulate(&acc, nonfmt);
12896 Py_DECREF(nonfmt);
12897 if (r)
12898 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012899 }
12900 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012901 /* Got a format specifier */
12902 int flags = 0;
12903 Py_ssize_t width = -1;
12904 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012905 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012906 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000012907 int isnumok;
12908 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012909 void *pbuf = NULL;
12910 Py_ssize_t pindex, len;
12911 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012912
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012913 fmtpos++;
12914 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12915 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012916 Py_ssize_t keylen;
12917 PyObject *key;
12918 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012919
Benjamin Peterson29060642009-01-31 22:14:21 +000012920 if (dict == NULL) {
12921 PyErr_SetString(PyExc_TypeError,
12922 "format requires a mapping");
12923 goto onError;
12924 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012925 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012926 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012927 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012928 /* Skip over balanced parentheses */
12929 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012930 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012931 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012932 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012933 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012934 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012935 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012936 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012937 if (fmtcnt < 0 || pcount > 0) {
12938 PyErr_SetString(PyExc_ValueError,
12939 "incomplete format key");
12940 goto onError;
12941 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012942 key = PyUnicode_Substring((PyObject*)uformat,
12943 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012944 if (key == NULL)
12945 goto onError;
12946 if (args_owned) {
12947 Py_DECREF(args);
12948 args_owned = 0;
12949 }
12950 args = PyObject_GetItem(dict, key);
12951 Py_DECREF(key);
12952 if (args == NULL) {
12953 goto onError;
12954 }
12955 args_owned = 1;
12956 arglen = -1;
12957 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012958 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012959 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012960 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012961 case '-': flags |= F_LJUST; continue;
12962 case '+': flags |= F_SIGN; continue;
12963 case ' ': flags |= F_BLANK; continue;
12964 case '#': flags |= F_ALT; continue;
12965 case '0': flags |= F_ZERO; continue;
12966 }
12967 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012968 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012969 if (c == '*') {
12970 v = getnextarg(args, arglen, &argidx);
12971 if (v == NULL)
12972 goto onError;
12973 if (!PyLong_Check(v)) {
12974 PyErr_SetString(PyExc_TypeError,
12975 "* wants int");
12976 goto onError;
12977 }
12978 width = PyLong_AsLong(v);
12979 if (width == -1 && PyErr_Occurred())
12980 goto onError;
12981 if (width < 0) {
12982 flags |= F_LJUST;
12983 width = -width;
12984 }
12985 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012986 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012987 }
12988 else if (c >= '0' && c <= '9') {
12989 width = c - '0';
12990 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012991 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012992 if (c < '0' || c > '9')
12993 break;
12994 if ((width*10) / 10 != width) {
12995 PyErr_SetString(PyExc_ValueError,
12996 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012997 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012998 }
12999 width = width*10 + (c - '0');
13000 }
13001 }
13002 if (c == '.') {
13003 prec = 0;
13004 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013005 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013006 if (c == '*') {
13007 v = getnextarg(args, arglen, &argidx);
13008 if (v == NULL)
13009 goto onError;
13010 if (!PyLong_Check(v)) {
13011 PyErr_SetString(PyExc_TypeError,
13012 "* wants int");
13013 goto onError;
13014 }
13015 prec = PyLong_AsLong(v);
13016 if (prec == -1 && PyErr_Occurred())
13017 goto onError;
13018 if (prec < 0)
13019 prec = 0;
13020 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013021 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013022 }
13023 else if (c >= '0' && c <= '9') {
13024 prec = c - '0';
13025 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013026 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013027 if (c < '0' || c > '9')
13028 break;
13029 if ((prec*10) / 10 != prec) {
13030 PyErr_SetString(PyExc_ValueError,
13031 "prec too big");
13032 goto onError;
13033 }
13034 prec = prec*10 + (c - '0');
13035 }
13036 }
13037 } /* prec */
13038 if (fmtcnt >= 0) {
13039 if (c == 'h' || c == 'l' || c == 'L') {
13040 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013041 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013042 }
13043 }
13044 if (fmtcnt < 0) {
13045 PyErr_SetString(PyExc_ValueError,
13046 "incomplete format");
13047 goto onError;
13048 }
13049 if (c != '%') {
13050 v = getnextarg(args, arglen, &argidx);
13051 if (v == NULL)
13052 goto onError;
13053 }
13054 sign = 0;
13055 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013056 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013057 switch (c) {
13058
13059 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013060 _PyAccu_Accumulate(&acc, percent);
13061 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013062
13063 case 's':
13064 case 'r':
13065 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013066 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013067 temp = v;
13068 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013069 }
13070 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013071 if (c == 's')
13072 temp = PyObject_Str(v);
13073 else if (c == 'r')
13074 temp = PyObject_Repr(v);
13075 else
13076 temp = PyObject_ASCII(v);
13077 if (temp == NULL)
13078 goto onError;
13079 if (PyUnicode_Check(temp))
13080 /* nothing to do */;
13081 else {
13082 Py_DECREF(temp);
13083 PyErr_SetString(PyExc_TypeError,
13084 "%s argument has non-string str()");
13085 goto onError;
13086 }
13087 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013088 if (PyUnicode_READY(temp) == -1) {
13089 Py_CLEAR(temp);
13090 goto onError;
13091 }
13092 pbuf = PyUnicode_DATA(temp);
13093 kind = PyUnicode_KIND(temp);
13094 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013095 if (prec >= 0 && len > prec)
13096 len = prec;
13097 break;
13098
13099 case 'i':
13100 case 'd':
13101 case 'u':
13102 case 'o':
13103 case 'x':
13104 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013105 isnumok = 0;
13106 if (PyNumber_Check(v)) {
13107 PyObject *iobj=NULL;
13108
13109 if (PyLong_Check(v)) {
13110 iobj = v;
13111 Py_INCREF(iobj);
13112 }
13113 else {
13114 iobj = PyNumber_Long(v);
13115 }
13116 if (iobj!=NULL) {
13117 if (PyLong_Check(iobj)) {
13118 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013119 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013120 Py_DECREF(iobj);
13121 if (!temp)
13122 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013123 if (PyUnicode_READY(temp) == -1) {
13124 Py_CLEAR(temp);
13125 goto onError;
13126 }
13127 pbuf = PyUnicode_DATA(temp);
13128 kind = PyUnicode_KIND(temp);
13129 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013130 sign = 1;
13131 }
13132 else {
13133 Py_DECREF(iobj);
13134 }
13135 }
13136 }
13137 if (!isnumok) {
13138 PyErr_Format(PyExc_TypeError,
13139 "%%%c format: a number is required, "
13140 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13141 goto onError;
13142 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013143 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013144 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013145 fillobj = zero;
13146 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013147 break;
13148
13149 case 'e':
13150 case 'E':
13151 case 'f':
13152 case 'F':
13153 case 'g':
13154 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013155 temp = formatfloat(v, flags, prec, c);
13156 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013157 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013158 if (PyUnicode_READY(temp) == -1) {
13159 Py_CLEAR(temp);
13160 goto onError;
13161 }
13162 pbuf = PyUnicode_DATA(temp);
13163 kind = PyUnicode_KIND(temp);
13164 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013165 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013166 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013167 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013168 fillobj = zero;
13169 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013170 break;
13171
13172 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013173 {
13174 Py_UCS4 ch = formatchar(v);
13175 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013176 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013177 temp = _PyUnicode_FromUCS4(&ch, 1);
13178 if (temp == NULL)
13179 goto onError;
13180 pbuf = PyUnicode_DATA(temp);
13181 kind = PyUnicode_KIND(temp);
13182 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013183 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013184 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013185
13186 default:
13187 PyErr_Format(PyExc_ValueError,
13188 "unsupported format character '%c' (0x%x) "
13189 "at index %zd",
13190 (31<=c && c<=126) ? (char)c : '?',
13191 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013192 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013193 goto onError;
13194 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013195 /* pbuf is initialized here. */
13196 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013197 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013198 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13199 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013200 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013201 pindex++;
13202 }
13203 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13204 signobj = plus;
13205 len--;
13206 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013207 }
13208 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013209 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013210 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013211 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013212 else
13213 sign = 0;
13214 }
13215 if (width < len)
13216 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013217 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013218 if (fill != ' ') {
13219 assert(signobj != NULL);
13220 if (_PyAccu_Accumulate(&acc, signobj))
13221 goto onError;
13222 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013223 if (width > len)
13224 width--;
13225 }
13226 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013227 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013228 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013229 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013230 second = get_latin1_char(
13231 PyUnicode_READ(kind, pbuf, pindex + 1));
13232 pindex += 2;
13233 if (second == NULL ||
13234 _PyAccu_Accumulate(&acc, zero) ||
13235 _PyAccu_Accumulate(&acc, second))
13236 goto onError;
13237 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013238 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013239 width -= 2;
13240 if (width < 0)
13241 width = 0;
13242 len -= 2;
13243 }
13244 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013245 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013246 if (repeat_accumulate(&acc, fillobj, width - len))
13247 goto onError;
13248 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013249 }
13250 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013251 if (sign) {
13252 assert(signobj != NULL);
13253 if (_PyAccu_Accumulate(&acc, signobj))
13254 goto onError;
13255 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013256 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013257 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13258 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013259 second = get_latin1_char(
13260 PyUnicode_READ(kind, pbuf, pindex + 1));
13261 pindex += 2;
13262 if (second == NULL ||
13263 _PyAccu_Accumulate(&acc, zero) ||
13264 _PyAccu_Accumulate(&acc, second))
13265 goto onError;
13266 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013267 }
13268 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013269 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013270 if (temp != NULL) {
13271 assert(pbuf == PyUnicode_DATA(temp));
13272 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013273 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013274 else {
13275 const char *p = (const char *) pbuf;
13276 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013277 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013278 v = PyUnicode_FromKindAndData(kind, p, len);
13279 }
13280 if (v == NULL)
13281 goto onError;
13282 r = _PyAccu_Accumulate(&acc, v);
13283 Py_DECREF(v);
13284 if (r)
13285 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013286 if (width > len && repeat_accumulate(&acc, blank, width - len))
13287 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013288 if (dict && (argidx < arglen) && c != '%') {
13289 PyErr_SetString(PyExc_TypeError,
13290 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013291 goto onError;
13292 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013293 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013294 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013295 } /* until end */
13296 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013297 PyErr_SetString(PyExc_TypeError,
13298 "not all arguments converted during string formatting");
13299 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013300 }
13301
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013302 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013303 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013304 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013305 }
13306 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013307 Py_XDECREF(temp);
13308 Py_XDECREF(second);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013309 return (PyObject *)result;
13310
Benjamin Peterson29060642009-01-31 22:14:21 +000013311 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013312 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013313 Py_XDECREF(temp);
13314 Py_XDECREF(second);
13315 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013316 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013317 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013318 }
13319 return NULL;
13320}
13321
Jeremy Hylton938ace62002-07-17 16:30:39 +000013322static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013323unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13324
Tim Peters6d6c1a32001-08-02 04:15:00 +000013325static PyObject *
13326unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13327{
Benjamin Peterson29060642009-01-31 22:14:21 +000013328 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013329 static char *kwlist[] = {"object", "encoding", "errors", 0};
13330 char *encoding = NULL;
13331 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013332
Benjamin Peterson14339b62009-01-31 16:36:08 +000013333 if (type != &PyUnicode_Type)
13334 return unicode_subtype_new(type, args, kwds);
13335 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013336 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013337 return NULL;
13338 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013339 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013340 if (encoding == NULL && errors == NULL)
13341 return PyObject_Str(x);
13342 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013343 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013344}
13345
Guido van Rossume023fe02001-08-30 03:12:59 +000013346static PyObject *
13347unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13348{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013349 PyUnicodeObject *unicode, *self;
13350 Py_ssize_t length, char_size;
13351 int share_wstr, share_utf8;
13352 unsigned int kind;
13353 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013354
Benjamin Peterson14339b62009-01-31 16:36:08 +000013355 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013356
13357 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
13358 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013359 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013360 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013361 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013362 return NULL;
13363
13364 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
13365 if (self == NULL) {
13366 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013367 return NULL;
13368 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013369 kind = PyUnicode_KIND(unicode);
13370 length = PyUnicode_GET_LENGTH(unicode);
13371
13372 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013373#ifdef Py_DEBUG
13374 _PyUnicode_HASH(self) = -1;
13375#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013376 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013377#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013378 _PyUnicode_STATE(self).interned = 0;
13379 _PyUnicode_STATE(self).kind = kind;
13380 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013381 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013382 _PyUnicode_STATE(self).ready = 1;
13383 _PyUnicode_WSTR(self) = NULL;
13384 _PyUnicode_UTF8_LENGTH(self) = 0;
13385 _PyUnicode_UTF8(self) = NULL;
13386 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013387 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013388
13389 share_utf8 = 0;
13390 share_wstr = 0;
13391 if (kind == PyUnicode_1BYTE_KIND) {
13392 char_size = 1;
13393 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13394 share_utf8 = 1;
13395 }
13396 else if (kind == PyUnicode_2BYTE_KIND) {
13397 char_size = 2;
13398 if (sizeof(wchar_t) == 2)
13399 share_wstr = 1;
13400 }
13401 else {
13402 assert(kind == PyUnicode_4BYTE_KIND);
13403 char_size = 4;
13404 if (sizeof(wchar_t) == 4)
13405 share_wstr = 1;
13406 }
13407
13408 /* Ensure we won't overflow the length. */
13409 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13410 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013411 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013412 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013413 data = PyObject_MALLOC((length + 1) * char_size);
13414 if (data == NULL) {
13415 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013416 goto onError;
13417 }
13418
Victor Stinnerc3c74152011-10-02 20:39:55 +020013419 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013420 if (share_utf8) {
13421 _PyUnicode_UTF8_LENGTH(self) = length;
13422 _PyUnicode_UTF8(self) = data;
13423 }
13424 if (share_wstr) {
13425 _PyUnicode_WSTR_LENGTH(self) = length;
13426 _PyUnicode_WSTR(self) = (wchar_t *)data;
13427 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013428
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013429 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013430 kind * (length + 1));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013431 Py_DECREF(unicode);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013432 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013433#ifdef Py_DEBUG
13434 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13435#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013436 return (PyObject *)self;
13437
13438onError:
13439 Py_DECREF(unicode);
13440 Py_DECREF(self);
13441 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013442}
13443
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013444PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013445 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013446\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013447Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013448encoding defaults to the current default string encoding.\n\
13449errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013450
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013451static PyObject *unicode_iter(PyObject *seq);
13452
Guido van Rossumd57fd912000-03-10 22:53:23 +000013453PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013454 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013455 "str", /* tp_name */
13456 sizeof(PyUnicodeObject), /* tp_size */
13457 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013458 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013459 (destructor)unicode_dealloc, /* tp_dealloc */
13460 0, /* tp_print */
13461 0, /* tp_getattr */
13462 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013463 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013464 unicode_repr, /* tp_repr */
13465 &unicode_as_number, /* tp_as_number */
13466 &unicode_as_sequence, /* tp_as_sequence */
13467 &unicode_as_mapping, /* tp_as_mapping */
13468 (hashfunc) unicode_hash, /* tp_hash*/
13469 0, /* tp_call*/
13470 (reprfunc) unicode_str, /* tp_str */
13471 PyObject_GenericGetAttr, /* tp_getattro */
13472 0, /* tp_setattro */
13473 0, /* tp_as_buffer */
13474 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013475 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013476 unicode_doc, /* tp_doc */
13477 0, /* tp_traverse */
13478 0, /* tp_clear */
13479 PyUnicode_RichCompare, /* tp_richcompare */
13480 0, /* tp_weaklistoffset */
13481 unicode_iter, /* tp_iter */
13482 0, /* tp_iternext */
13483 unicode_methods, /* tp_methods */
13484 0, /* tp_members */
13485 0, /* tp_getset */
13486 &PyBaseObject_Type, /* tp_base */
13487 0, /* tp_dict */
13488 0, /* tp_descr_get */
13489 0, /* tp_descr_set */
13490 0, /* tp_dictoffset */
13491 0, /* tp_init */
13492 0, /* tp_alloc */
13493 unicode_new, /* tp_new */
13494 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013495};
13496
13497/* Initialize the Unicode implementation */
13498
Thomas Wouters78890102000-07-22 19:25:51 +000013499void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013500{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013501 int i;
13502
Thomas Wouters477c8d52006-05-27 19:21:47 +000013503 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013504 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013505 0x000A, /* LINE FEED */
13506 0x000D, /* CARRIAGE RETURN */
13507 0x001C, /* FILE SEPARATOR */
13508 0x001D, /* GROUP SEPARATOR */
13509 0x001E, /* RECORD SEPARATOR */
13510 0x0085, /* NEXT LINE */
13511 0x2028, /* LINE SEPARATOR */
13512 0x2029, /* PARAGRAPH SEPARATOR */
13513 };
13514
Fred Drakee4315f52000-05-09 19:53:39 +000013515 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013516 unicode_empty = PyUnicode_New(0, 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013517 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013518 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013519 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013520
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013521 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013522 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013523 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013524 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013525
13526 /* initialize the linebreak bloom filter */
13527 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013528 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013529 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013530
13531 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013532}
13533
13534/* Finalize the Unicode implementation */
13535
/* Free-list clearing entry point.  This implementation releases nothing
   and always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
13541
Guido van Rossumd57fd912000-03-10 22:53:23 +000013542void
Thomas Wouters78890102000-07-22 19:25:51 +000013543_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013544{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013545 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013546
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013547 Py_XDECREF(unicode_empty);
13548 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013549
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013550 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013551 if (unicode_latin1[i]) {
13552 Py_DECREF(unicode_latin1[i]);
13553 unicode_latin1[i] = NULL;
13554 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013555 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020013556 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000013557 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013558}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013559
Walter Dörwald16807132007-05-25 13:52:07 +000013560void
13561PyUnicode_InternInPlace(PyObject **p)
13562{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013563 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
13564 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013565#ifdef Py_DEBUG
13566 assert(s != NULL);
13567 assert(_PyUnicode_CHECK(s));
13568#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013569 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013570 return;
13571#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013572 /* If it's a subclass, we don't really know what putting
13573 it in the interned dict might do. */
13574 if (!PyUnicode_CheckExact(s))
13575 return;
13576 if (PyUnicode_CHECK_INTERNED(s))
13577 return;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013578 if (_PyUnicode_READY_REPLACE(p)) {
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013579 assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013580 return;
13581 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013582 s = (PyUnicodeObject *)(*p);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013583 if (interned == NULL) {
13584 interned = PyDict_New();
13585 if (interned == NULL) {
13586 PyErr_Clear(); /* Don't leave an exception */
13587 return;
13588 }
13589 }
13590 /* It might be that the GetItem call fails even
13591 though the key is present in the dictionary,
13592 namely when this happens during a stack overflow. */
13593 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000013594 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013595 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013596
Benjamin Peterson29060642009-01-31 22:14:21 +000013597 if (t) {
13598 Py_INCREF(t);
13599 Py_DECREF(*p);
13600 *p = t;
13601 return;
13602 }
Walter Dörwald16807132007-05-25 13:52:07 +000013603
Benjamin Peterson14339b62009-01-31 16:36:08 +000013604 PyThreadState_GET()->recursion_critical = 1;
13605 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
13606 PyErr_Clear();
13607 PyThreadState_GET()->recursion_critical = 0;
13608 return;
13609 }
13610 PyThreadState_GET()->recursion_critical = 0;
13611 /* The two references in interned are not counted by refcnt.
13612 The deallocator will take care of this */
13613 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013614 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013615}
13616
13617void
13618PyUnicode_InternImmortal(PyObject **p)
13619{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013620 PyUnicodeObject *u = (PyUnicodeObject *)*p;
13621
Benjamin Peterson14339b62009-01-31 16:36:08 +000013622 PyUnicode_InternInPlace(p);
13623 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013624 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013625 Py_INCREF(*p);
13626 }
Walter Dörwald16807132007-05-25 13:52:07 +000013627}
13628
13629PyObject *
13630PyUnicode_InternFromString(const char *cp)
13631{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013632 PyObject *s = PyUnicode_FromString(cp);
13633 if (s == NULL)
13634 return NULL;
13635 PyUnicode_InternInPlace(&s);
13636 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013637}
13638
Alexander Belopolsky40018472011-02-26 01:02:56 +000013639void
13640_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013641{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013642 PyObject *keys;
13643 PyUnicodeObject *s;
13644 Py_ssize_t i, n;
13645 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013646
Benjamin Peterson14339b62009-01-31 16:36:08 +000013647 if (interned == NULL || !PyDict_Check(interned))
13648 return;
13649 keys = PyDict_Keys(interned);
13650 if (keys == NULL || !PyList_Check(keys)) {
13651 PyErr_Clear();
13652 return;
13653 }
Walter Dörwald16807132007-05-25 13:52:07 +000013654
Benjamin Peterson14339b62009-01-31 16:36:08 +000013655 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13656 detector, interned unicode strings are not forcibly deallocated;
13657 rather, we give them their stolen references back, and then clear
13658 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013659
Benjamin Peterson14339b62009-01-31 16:36:08 +000013660 n = PyList_GET_SIZE(keys);
13661 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013662 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013663 for (i = 0; i < n; i++) {
13664 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013665 if (PyUnicode_READY(s) == -1) {
13666 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013667 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013668 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013669 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013670 case SSTATE_NOT_INTERNED:
13671 /* XXX Shouldn't happen */
13672 break;
13673 case SSTATE_INTERNED_IMMORTAL:
13674 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013675 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013676 break;
13677 case SSTATE_INTERNED_MORTAL:
13678 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013679 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013680 break;
13681 default:
13682 Py_FatalError("Inconsistent interned string state.");
13683 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013684 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013685 }
13686 fprintf(stderr, "total size of all interned strings: "
13687 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13688 "mortal/immortal\n", mortal_size, immortal_size);
13689 Py_DECREF(keys);
13690 PyDict_Clear(interned);
13691 Py_DECREF(interned);
13692 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013693}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013694
13695
13696/********************* Unicode Iterator **************************/
13697
13698typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013699 PyObject_HEAD
13700 Py_ssize_t it_index;
13701 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013702} unicodeiterobject;
13703
13704static void
13705unicodeiter_dealloc(unicodeiterobject *it)
13706{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013707 _PyObject_GC_UNTRACK(it);
13708 Py_XDECREF(it->it_seq);
13709 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013710}
13711
13712static int
13713unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13714{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013715 Py_VISIT(it->it_seq);
13716 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013717}
13718
13719static PyObject *
13720unicodeiter_next(unicodeiterobject *it)
13721{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013722 PyUnicodeObject *seq;
13723 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013724
Benjamin Peterson14339b62009-01-31 16:36:08 +000013725 assert(it != NULL);
13726 seq = it->it_seq;
13727 if (seq == NULL)
13728 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013729 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013730
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013731 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13732 int kind = PyUnicode_KIND(seq);
13733 void *data = PyUnicode_DATA(seq);
13734 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13735 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013736 if (item != NULL)
13737 ++it->it_index;
13738 return item;
13739 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013740
Benjamin Peterson14339b62009-01-31 16:36:08 +000013741 Py_DECREF(seq);
13742 it->it_seq = NULL;
13743 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013744}
13745
13746static PyObject *
13747unicodeiter_len(unicodeiterobject *it)
13748{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013749 Py_ssize_t len = 0;
13750 if (it->it_seq)
13751 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
13752 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013753}
13754
13755PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
13756
13757static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013758 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000013759 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000013760 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013761};
13762
13763PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013764 PyVarObject_HEAD_INIT(&PyType_Type, 0)
13765 "str_iterator", /* tp_name */
13766 sizeof(unicodeiterobject), /* tp_basicsize */
13767 0, /* tp_itemsize */
13768 /* methods */
13769 (destructor)unicodeiter_dealloc, /* tp_dealloc */
13770 0, /* tp_print */
13771 0, /* tp_getattr */
13772 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013773 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013774 0, /* tp_repr */
13775 0, /* tp_as_number */
13776 0, /* tp_as_sequence */
13777 0, /* tp_as_mapping */
13778 0, /* tp_hash */
13779 0, /* tp_call */
13780 0, /* tp_str */
13781 PyObject_GenericGetAttr, /* tp_getattro */
13782 0, /* tp_setattro */
13783 0, /* tp_as_buffer */
13784 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
13785 0, /* tp_doc */
13786 (traverseproc)unicodeiter_traverse, /* tp_traverse */
13787 0, /* tp_clear */
13788 0, /* tp_richcompare */
13789 0, /* tp_weaklistoffset */
13790 PyObject_SelfIter, /* tp_iter */
13791 (iternextfunc)unicodeiter_next, /* tp_iternext */
13792 unicodeiter_methods, /* tp_methods */
13793 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013794};
13795
13796static PyObject *
13797unicode_iter(PyObject *seq)
13798{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013799 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013800
Benjamin Peterson14339b62009-01-31 16:36:08 +000013801 if (!PyUnicode_Check(seq)) {
13802 PyErr_BadInternalCall();
13803 return NULL;
13804 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013805 if (PyUnicode_READY(seq) == -1)
13806 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013807 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13808 if (it == NULL)
13809 return NULL;
13810 it->it_index = 0;
13811 Py_INCREF(seq);
13812 it->it_seq = (PyUnicodeObject *)seq;
13813 _PyObject_GC_TRACK(it);
13814 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013815}
13816
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013817#define UNIOP(x) Py_UNICODE_##x
13818#define UNIOP_t Py_UNICODE
13819#include "uniops.h"
13820#undef UNIOP
13821#undef UNIOP_t
13822#define UNIOP(x) Py_UCS4_##x
13823#define UNIOP_t Py_UCS4
13824#include "uniops.h"
13825#undef UNIOP
13826#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000013827
Victor Stinner71133ff2010-09-01 23:43:53 +000013828Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000013829PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000013830{
13831 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
13832 Py_UNICODE *copy;
13833 Py_ssize_t size;
13834
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013835 if (!PyUnicode_Check(unicode)) {
13836 PyErr_BadArgument();
13837 return NULL;
13838 }
Victor Stinner71133ff2010-09-01 23:43:53 +000013839 /* Ensure we won't overflow the size. */
13840 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13841 PyErr_NoMemory();
13842 return NULL;
13843 }
13844 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13845 size *= sizeof(Py_UNICODE);
13846 copy = PyMem_Malloc(size);
13847 if (copy == NULL) {
13848 PyErr_NoMemory();
13849 return NULL;
13850 }
13851 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
13852 return copy;
13853}
Martin v. Löwis5b222132007-06-10 09:51:05 +000013854
Georg Brandl66c221e2010-10-14 07:04:07 +000013855/* A _string module, to export formatter_parser and formatter_field_name_split
13856 to the string.Formatter class implemented in Python. */
13857
13858static PyMethodDef _string_methods[] = {
13859 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
13860 METH_O, PyDoc_STR("split the argument as a field name")},
13861 {"formatter_parser", (PyCFunction) formatter_parser,
13862 METH_O, PyDoc_STR("parse the argument as a format string")},
13863 {NULL, NULL}
13864};
13865
13866static struct PyModuleDef _string_module = {
13867 PyModuleDef_HEAD_INIT,
13868 "_string",
13869 PyDoc_STR("string helper module"),
13870 0,
13871 _string_methods,
13872 NULL,
13873 NULL,
13874 NULL,
13875 NULL
13876};
13877
13878PyMODINIT_FUNC
13879PyInit__string(void)
13880{
13881 return PyModule_Create(&_string_module);
13882}
13883
13884
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013885#ifdef __cplusplus
13886}
13887#endif