blob: 3a0f4687511a36abdddaaa7688419aa5004c640e [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Victor Stinnerce5faf62011-10-05 00:42:43 +020049#ifdef Py_DEBUG
50# define DONT_MAKE_RESULT_READY
51#endif
52
/* Limit for the Unicode object free list: the maximum number of
   entries it may hold. */

#define PyUnicode_MAXFREELIST       1024

/* Limit for the Unicode object free list keep-alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian.
   Exactly one of BYTEORDER_IS_BIG_ENDIAN / BYTEORDER_IS_LITTLE_ENDIAN
   is defined, selected by WORDS_BIGENDIAN. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
/* Object check used by the accessor macros below.  In Py_DEBUG builds it
   runs _PyUnicode_CheckConsistency() (without the content check); in
   other builds it is a plain type check. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Raw access to the utf8 field; no checks, usable as an lvalue. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* Checked read of the UTF-8 pointer: for a compact ASCII object this is
   the address immediately following the PyASCIIObject header, otherwise
   the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Raw access to the utf8_length field; no checks, usable as an lvalue. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Checked read of the UTF-8 length: the object's length field for a
   compact ASCII object, otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Unchecked field accessors (all usable as lvalues). */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* Same as the state.kind / length fields, but asserting
   _PyUnicode_CHECK(op) first. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Unchecked access to the data.any pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)

/* Replace any earlier definition of PyUnicode_READY with a version that
   asserts _PyUnicode_CHECK(op).  Evaluates to 0 when the object is
   already ready, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready((PyObject *)(op))))
Victor Stinner910337b2011-10-03 03:20:16 +0200143
/* Like PyUnicode_READY(), but takes a pointer to the object reference
   (PyObject **).  Evaluates to 0 when *p_obj is already ready, otherwise
   to the result of _PyUnicode_ReadyReplace(p_obj).

   The argument is fully parenthesized so that expressions such as
   `&obj` or `objs + i` dereference correctly. */
#define _PyUnicode_READY_REPLACE(p_obj)                 \
    (assert(_PyUnicode_CHECK(*(p_obj))),                \
     (PyUnicode_IS_READY(*(p_obj)) ?                    \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
148
/* True if the utf8 pointer of the object aliases its character data
   (PyUnicode_DATA).  Must not be used on compact ASCII objects. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of the object aliases its character data
   (PyUnicode_DATA).  The macro uses only its own parameter, so it works
   whatever the caller's variable is named. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
156
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the object is not compact ASCII, its
   utf8 pointer is non-NULL, and that pointer differs from the
   character data pointer. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): wstr is non-NULL and either the object
   is not ready yet or wstr differs from the character data pointer. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
172
/* Element-wise copy between character buffers of different widths.

   from_type / to_type: valid type names of the source and destination
       elements.
   begin, end: pointers of type "from_type *" delimiting the half-open
       source range [begin, end).
   to: destination buffer (cast to "to_type *"); it must have room for
       (end - begin) elements.

   Each source element is converted with a plain C cast, so values that
   do not fit in to_type are truncated. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        const from_type *iter_ = (begin);               \
        to_type *to_ = (to_type *)(to);                 \
        while (iter_ < (end)) {                         \
            *to_ = (to_type)*iter_;                     \
            ++iter_;                                    \
            ++to_;                                      \
        }                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200187
/* The Unicode string has been modified: reset the hash */
#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)

/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned;

/* The empty Unicode object is shared to improve performance. */
static PyObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  The array has one slot per Latin-1 code point. */
static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000207
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by an ASCII code point (0..127); an entry is 1
   when the code point counts as whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
238
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200239/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200240static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200241static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200242static void copy_characters(
243 PyObject *to, Py_ssize_t to_start,
244 PyObject *from, Py_ssize_t from_start,
245 Py_ssize_t how_many);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200246#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200247static int unicode_is_singleton(PyObject *unicode);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200248#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200249
Alexander Belopolsky40018472011-02-26 01:02:56 +0000250static PyObject *
251unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000252 PyObject **errorHandler,const char *encoding, const char *reason,
253 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
254 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
255
Alexander Belopolsky40018472011-02-26 01:02:56 +0000256static void
257raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300258 const char *encoding,
259 const Py_UNICODE *unicode, Py_ssize_t size,
260 Py_ssize_t startpos, Py_ssize_t endpos,
261 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000262
/* Same for linebreaks.

   Lookup table indexed by an ASCII code point (0..127); an entry is 1
   when the code point counts as a line break and 0 otherwise.  The table
   is only ever read, so it is declared const (like
   _Py_ascii_whitespace). */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
290
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300291/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
292 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000293Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000294PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000295{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000296#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000297 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000298#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000299 /* This is actually an illegal character, so it should
300 not be passed to unichr. */
301 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000302#endif
303}
304
#ifdef Py_DEBUG
/* Debug-only validator for the internal state of a Unicode object.

   op: the object to check; asserted to satisfy PyUnicode_Check().
   check_content: when non-zero, additionally scan the characters to
       verify the storage kind is the narrowest possible one, and verify
       that the hash is still unset (-1) unless the object is one of the
       shared singletons.

   Every violated invariant trips an assert(); the function itself always
   returns 1 so that it can be used inside assert(). */
/* FIXME: use PyObject* type for op */
int
_PyUnicode_CheckConsistency(void *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII: always 1 byte per character and ready */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: characters are stored right after the
               PyCompactUnicodeObject header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            /* utf8 must not alias the character data */
            assert (compact->utf8 != data);
        } else {
            /* non-compact object: characters live behind data.any */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr buffer exists */
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
            }
            else {
                /* ready, non-compact object */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII: utf8 aliases the character data */
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr aliases the character data exactly when the kind has
               the same width as wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
                )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* a missing cached representation must have a zero length */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);
        /* find the largest code point stored in the string */
        for (i=0; i < ascii->length; i++)
        {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0)
                assert(maxchar >= 128);
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND)
            assert(maxchar >= 0x100);
        else
            assert(maxchar >= 0x10000);
    }
    /* the hash must still be unset, except for shared singletons */
    if (check_content && !unicode_is_singleton((PyObject*)ascii))
        assert(ascii->hash == -1);
    return 1;
}
#endif
410
/* --- Bloom Filters ----------------------------------------------------- */

/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low bits
   of each unicode character (ch & (BLOOM_WIDTH - 1)) as the bit index. */

/* the linebreak mask is set up by Unicode_Init below */

/* BLOOM_WIDTH is the number of bits used in the mask: the largest of
   128, 64 or 32 that does not exceed LONG_BIT. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif
428
/* Integer type holding a bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Bloom mask used by BLOOM_LINEBREAK() for non-ASCII characters. */
static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD(mask, ch): set the bit selected by ch in the lvalue mask.
   BLOOM(mask, ch): non-zero if the bit selected by ch is set in mask.

   Both arguments are parenthesized so that compound expressions (for
   example BLOOM(a | b, ch)) are evaluated as a whole. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: a table lookup for ch < 128, otherwise the bloom
   filter is consulted first and Py_UNICODE_ISLINEBREAK() confirms a
   possible hit. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000439
Alexander Belopolsky40018472011-02-26 01:02:56 +0000440Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200441make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000442{
443 /* calculate simple bloom-style bitmask for a given unicode string */
444
Antoine Pitrouf068f942010-01-13 14:19:12 +0000445 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000446 Py_ssize_t i;
447
448 mask = 0;
449 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200450 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000451
452 return mask;
453}
454
/* True if chr occurs in the Unicode object str.  The bloom mask is
   tested first; only on a possible hit is the string actually searched
   with PyUnicode_FindChar().  Note that chr and str are each evaluated
   more than once. */
#define BLOOM_MEMBER(mask, chr, str) \
    (BLOOM(mask, chr) \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000458
Guido van Rossumd57fd912000-03-10 22:53:23 +0000459/* --- Unicode Object ----------------------------------------------------- */
460
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200461static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200462fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200463
464Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
465 Py_ssize_t size, Py_UCS4 ch,
466 int direction)
467{
468 /* like wcschr, but doesn't stop at NULL characters */
469 Py_ssize_t i;
470 if (direction == 1) {
471 for(i = 0; i < size; i++)
472 if (PyUnicode_READ(kind, s, i) == ch)
473 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
474 }
475 else {
476 for(i = size-1; i >= 0; i--)
477 if (PyUnicode_READ(kind, s, i) == ch)
478 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
479 }
480 return NULL;
481}
482
Victor Stinnerfe226c02011-10-03 03:52:20 +0200483static PyObject*
484resize_compact(PyObject *unicode, Py_ssize_t length)
485{
486 Py_ssize_t char_size;
487 Py_ssize_t struct_size;
488 Py_ssize_t new_size;
489 int share_wstr;
490
491 assert(PyUnicode_IS_READY(unicode));
492 char_size = PyUnicode_CHARACTER_SIZE(unicode);
493 if (PyUnicode_IS_COMPACT_ASCII(unicode))
494 struct_size = sizeof(PyASCIIObject);
495 else
496 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200497 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200498
499 _Py_DEC_REFTOTAL;
500 _Py_ForgetReference(unicode);
501
502 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
503 PyErr_NoMemory();
504 return NULL;
505 }
506 new_size = (struct_size + (length + 1) * char_size);
507
508 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
509 if (unicode == NULL) {
510 PyObject_Del(unicode);
511 PyErr_NoMemory();
512 return NULL;
513 }
514 _Py_NewReference(unicode);
515 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200516 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200517 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200518 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
519 _PyUnicode_WSTR_LENGTH(unicode) = length;
520 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200521 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
522 length, 0);
523 return unicode;
524}
525
/* Resize a non-compact Unicode object in place by reallocating its
   separately allocated buffers.

   unicode: the object to resize; asserted to be non-compact and to have
       a reference count of 1.
   length: new number of characters, not counting the terminating null.

   Returns 0 on success, or -1 with MemoryError set on overflow or
   allocation failure.  The cached hash is reset before anything else. */
static int
resize_inplace(PyUnicodeObject *unicode, Py_ssize_t length)
{
    wchar_t *wstr;
    assert(!PyUnicode_IS_COMPACT(unicode));
    assert(Py_REFCNT(unicode) == 1);

    _PyUnicode_DIRTY(unicode);

    if (PyUnicode_IS_READY(unicode)) {
        Py_ssize_t char_size;
        Py_ssize_t new_size;
        int share_wstr, share_utf8;
        void *data;

        data = _PyUnicode_DATA_ANY(unicode);
        assert(data != NULL);
        char_size = PyUnicode_CHARACTER_SIZE(unicode);
        /* remember which cached representations alias the data buffer;
           they have to follow it when it is reallocated */
        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
        /* a separately allocated UTF-8 cache is simply dropped */
        if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
        {
            PyObject_DEL(_PyUnicode_UTF8(unicode));
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }

        /* check for integer overflow of (length + 1) * char_size */
        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
            PyErr_NoMemory();
            return -1;
        }
        new_size = (length + 1) * char_size;

        /* only the local copy is overwritten: on failure the object
           still points at the old buffer */
        data = (PyObject *)PyObject_REALLOC(data, new_size);
        if (data == NULL) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_DATA_ANY(unicode) = data;
        if (share_wstr) {
            _PyUnicode_WSTR(unicode) = data;
            _PyUnicode_WSTR_LENGTH(unicode) = length;
        }
        if (share_utf8) {
            _PyUnicode_UTF8(unicode) = data;
            _PyUnicode_UTF8_LENGTH(unicode) = length;
        }
        _PyUnicode_LENGTH(unicode) = length;
        /* write the terminating null character */
        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
        /* done unless a separately allocated wstr buffer still has to
           be resized below */
        if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
            assert(_PyUnicode_CheckConsistency(unicode, 0));
            return 0;
        }
    }
    assert(_PyUnicode_WSTR(unicode) != NULL);

    /* check for integer overflow */
    if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
        PyErr_NoMemory();
        return -1;
    }
    /* realloc through a local so the object keeps its old wstr buffer
       if the allocation fails */
    wstr =  _PyUnicode_WSTR(unicode);
    wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
    if (!wstr) {
        PyErr_NoMemory();
        return -1;
    }
    _PyUnicode_WSTR(unicode) = wstr;
    _PyUnicode_WSTR(unicode)[length] = 0;
    _PyUnicode_WSTR_LENGTH(unicode) = length;
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return 0;
}
599
Victor Stinnerfe226c02011-10-03 03:52:20 +0200600static PyObject*
601resize_copy(PyObject *unicode, Py_ssize_t length)
602{
603 Py_ssize_t copy_length;
604 if (PyUnicode_IS_COMPACT(unicode)) {
605 PyObject *copy;
606 assert(PyUnicode_IS_READY(unicode));
607
608 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
609 if (copy == NULL)
610 return NULL;
611
612 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200613 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200614 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200615 }
616 else {
Victor Stinner2fd82272011-10-03 04:06:05 +0200617 PyUnicodeObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200618 assert(_PyUnicode_WSTR(unicode) != NULL);
619 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner2fd82272011-10-03 04:06:05 +0200620 w = _PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200621 if (w == NULL)
622 return NULL;
623 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
624 copy_length = Py_MIN(copy_length, length);
625 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
626 copy_length);
627 return (PyObject*)w;
628 }
629}
630
/* We allocate one more character to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/

639
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200640#ifdef Py_DEBUG
641int unicode_old_new_calls = 0;
642#endif
643
Alexander Belopolsky40018472011-02-26 01:02:56 +0000644static PyUnicodeObject *
645_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000646{
647 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200648 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000649
Thomas Wouters477c8d52006-05-27 19:21:47 +0000650 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000651 if (length == 0 && unicode_empty != NULL) {
652 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200653 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000654 }
655
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000656 /* Ensure we won't overflow the size. */
657 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
658 return (PyUnicodeObject *)PyErr_NoMemory();
659 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200660 if (length < 0) {
661 PyErr_SetString(PyExc_SystemError,
662 "Negative size passed to _PyUnicode_New");
663 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000664 }
665
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200666#ifdef Py_DEBUG
667 ++unicode_old_new_calls;
668#endif
669
670 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
671 if (unicode == NULL)
672 return NULL;
673 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
674 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
675 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000676 PyErr_NoMemory();
677 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000678 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200679
Jeremy Hyltond8082792003-09-16 19:41:39 +0000680 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000681 * the caller fails before initializing str -- unicode_resize()
682 * reads str[0], and the Keep-Alive optimization can keep memory
683 * allocated for str alive across a call to unicode_dealloc(unicode).
684 * We don't want unicode_resize to read uninitialized memory in
685 * that case.
686 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200687 _PyUnicode_WSTR(unicode)[0] = 0;
688 _PyUnicode_WSTR(unicode)[length] = 0;
689 _PyUnicode_WSTR_LENGTH(unicode) = length;
690 _PyUnicode_HASH(unicode) = -1;
691 _PyUnicode_STATE(unicode).interned = 0;
692 _PyUnicode_STATE(unicode).kind = 0;
693 _PyUnicode_STATE(unicode).compact = 0;
694 _PyUnicode_STATE(unicode).ready = 0;
695 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200696 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200697 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200698 _PyUnicode_UTF8(unicode) = NULL;
699 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000700 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000701
Benjamin Peterson29060642009-01-31 22:14:21 +0000702 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000703 /* XXX UNREF/NEWREF interface should be more symmetrical */
704 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000705 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000706 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000707 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000708}
709
Victor Stinnerf42dc442011-10-02 23:33:16 +0200710static const char*
711unicode_kind_name(PyObject *unicode)
712{
Victor Stinner42dfd712011-10-03 14:41:45 +0200713 /* don't check consistency: unicode_kind_name() is called from
714 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200715 if (!PyUnicode_IS_COMPACT(unicode))
716 {
717 if (!PyUnicode_IS_READY(unicode))
718 return "wstr";
719 switch(PyUnicode_KIND(unicode))
720 {
721 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200722 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200723 return "legacy ascii";
724 else
725 return "legacy latin1";
726 case PyUnicode_2BYTE_KIND:
727 return "legacy UCS2";
728 case PyUnicode_4BYTE_KIND:
729 return "legacy UCS4";
730 default:
731 return "<legacy invalid kind>";
732 }
733 }
734 assert(PyUnicode_IS_READY(unicode));
735 switch(PyUnicode_KIND(unicode))
736 {
737 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200738 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200739 return "ascii";
740 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200741 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200742 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200743 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200744 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200745 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200746 default:
747 return "<invalid compact kind>";
748 }
749}
750
#ifdef Py_DEBUG
/* Counter incremented by PyUnicode_New() in debug builds. */
int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Return the result of the PyUnicode_UTF8() macro for `unicode`. */
char *_PyUnicode_utf8(void *unicode){
    return PyUnicode_UTF8(unicode);
}

/* Return the result of the _PyUnicode_COMPACT_DATA() macro for `unicode`. */
void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print the addresses and flags involved in locating the character data of
   `unicode` to stdout, then return the result of PyUnicode_DATA(). */
void *_PyUnicode_data(void *unicode){
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}

/* Print a one-line summary of the internal layout of `op` to stdout:
   storage kind, length, the wstr / utf8 / data pointers, and whether the
   wstr or utf8 pointer is shared with the data pointer.

   The wstr length and the utf8 fields are skipped for compact ASCII
   objects.  Lengths are signed, hence the %zd conversions; pointers are
   cast to void * as %p requires. */
void
_PyUnicode_Dump(PyObject *op)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
    void *data;
    printf("%s: len=%zd, ", unicode_kind_name(op), ascii->length);
    if (ascii->state.compact)
        data = (compact + 1);
    else
        data = unicode->data.any;
    if (ascii->wstr == data)
        printf("shared ");
    printf("wstr=%p", (void *)ascii->wstr);
    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
        printf(" (%zd), ", compact->wstr_length);
        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
            printf("shared ");
        printf("utf8=%p (%zd)", (void *)compact->utf8, compact->utf8_length);
    }
    printf(", data=%p\n", data);
}
#endif
796
797PyObject *
798PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
799{
800 PyObject *obj;
801 PyCompactUnicodeObject *unicode;
802 void *data;
803 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200804 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200805 Py_ssize_t char_size;
806 Py_ssize_t struct_size;
807
808 /* Optimization for empty strings */
809 if (size == 0 && unicode_empty != NULL) {
810 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200811 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200812 }
813
814#ifdef Py_DEBUG
815 ++unicode_new_new_calls;
816#endif
817
Victor Stinner9e9d6892011-10-04 01:02:02 +0200818 is_ascii = 0;
819 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200820 struct_size = sizeof(PyCompactUnicodeObject);
821 if (maxchar < 128) {
822 kind_state = PyUnicode_1BYTE_KIND;
823 char_size = 1;
824 is_ascii = 1;
825 struct_size = sizeof(PyASCIIObject);
826 }
827 else if (maxchar < 256) {
828 kind_state = PyUnicode_1BYTE_KIND;
829 char_size = 1;
830 }
831 else if (maxchar < 65536) {
832 kind_state = PyUnicode_2BYTE_KIND;
833 char_size = 2;
834 if (sizeof(wchar_t) == 2)
835 is_sharing = 1;
836 }
837 else {
838 kind_state = PyUnicode_4BYTE_KIND;
839 char_size = 4;
840 if (sizeof(wchar_t) == 4)
841 is_sharing = 1;
842 }
843
844 /* Ensure we won't overflow the size. */
845 if (size < 0) {
846 PyErr_SetString(PyExc_SystemError,
847 "Negative size passed to PyUnicode_New");
848 return NULL;
849 }
850 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
851 return PyErr_NoMemory();
852
853 /* Duplicated allocation code from _PyObject_New() instead of a call to
854 * PyObject_New() so we are able to allocate space for the object and
855 * it's data buffer.
856 */
857 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
858 if (obj == NULL)
859 return PyErr_NoMemory();
860 obj = PyObject_INIT(obj, &PyUnicode_Type);
861 if (obj == NULL)
862 return NULL;
863
864 unicode = (PyCompactUnicodeObject *)obj;
865 if (is_ascii)
866 data = ((PyASCIIObject*)obj) + 1;
867 else
868 data = unicode + 1;
869 _PyUnicode_LENGTH(unicode) = size;
870 _PyUnicode_HASH(unicode) = -1;
871 _PyUnicode_STATE(unicode).interned = 0;
872 _PyUnicode_STATE(unicode).kind = kind_state;
873 _PyUnicode_STATE(unicode).compact = 1;
874 _PyUnicode_STATE(unicode).ready = 1;
875 _PyUnicode_STATE(unicode).ascii = is_ascii;
876 if (is_ascii) {
877 ((char*)data)[size] = 0;
878 _PyUnicode_WSTR(unicode) = NULL;
879 }
880 else if (kind_state == PyUnicode_1BYTE_KIND) {
881 ((char*)data)[size] = 0;
882 _PyUnicode_WSTR(unicode) = NULL;
883 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200884 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200885 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200886 }
887 else {
888 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200889 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200890 if (kind_state == PyUnicode_2BYTE_KIND)
891 ((Py_UCS2*)data)[size] = 0;
892 else /* kind_state == PyUnicode_4BYTE_KIND */
893 ((Py_UCS4*)data)[size] = 0;
894 if (is_sharing) {
895 _PyUnicode_WSTR_LENGTH(unicode) = size;
896 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
897 }
898 else {
899 _PyUnicode_WSTR_LENGTH(unicode) = 0;
900 _PyUnicode_WSTR(unicode) = NULL;
901 }
902 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200903 assert(_PyUnicode_CheckConsistency(unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200904 return obj;
905}
906
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4, writing the result
   into the 4-byte data buffer of `unicode`.  A high surrogate immediately
   followed by a low surrogate is combined into a single code point; every
   other unit (including a lone surrogate) is copied unchanged.  The other
   conversions are implemented as macros for efficiency.

   The destination must already be a 4-byte-kind string whose length equals
   the number of code points produced; this is only checked by assertions.
   No terminating null is written here. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (src[0] >= 0xD800 && src[0] <= 0xDBFF
            && (src + 1) < end && src[1] >= 0xDC00 && src[1] <= 0xDFFF)
        {
            /* surrogate pair: two input units become one code point */
            *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
            src += 2;
            continue;
        }
        *dst++ = *src;
        src++;
    }
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
945
Victor Stinnercd9950f2011-10-02 00:34:53 +0200946static int
947_PyUnicode_Dirty(PyObject *unicode)
948{
Victor Stinner910337b2011-10-03 03:20:16 +0200949 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +0200950 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +0200951 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +0200952 "Cannot modify a string having more than 1 reference");
953 return -1;
954 }
955 _PyUnicode_DIRTY(unicode);
956 return 0;
957}
958
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200959static int
960_copy_characters(PyObject *to, Py_ssize_t to_start,
961 PyObject *from, Py_ssize_t from_start,
962 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200963{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200964 unsigned int from_kind, to_kind;
965 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200966 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200967
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200968 assert(PyUnicode_Check(from));
969 assert(PyUnicode_Check(to));
970 assert(PyUnicode_IS_READY(from));
971 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200972
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200973 assert(PyUnicode_GET_LENGTH(from) >= how_many);
974 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
975 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200976
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200977 if (how_many == 0)
978 return 0;
979
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200980 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200981 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200982 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200983 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200984
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200985#ifdef Py_DEBUG
986 if (!check_maxchar
987 && (from_kind > to_kind
988 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200989 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200990 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
991 Py_UCS4 ch;
992 Py_ssize_t i;
993 for (i=0; i < how_many; i++) {
994 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
995 assert(ch <= to_maxchar);
996 }
997 }
998#endif
999 fast = (from_kind == to_kind);
1000 if (check_maxchar
1001 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1002 {
1003 /* deny latin1 => ascii */
1004 fast = 0;
1005 }
1006
1007 if (fast) {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001008 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001009 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +02001010 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001011 + PyUnicode_KIND_SIZE(from_kind, from_start),
1012 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001013 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001014 else if (from_kind == PyUnicode_1BYTE_KIND
1015 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001016 {
1017 _PyUnicode_CONVERT_BYTES(
1018 Py_UCS1, Py_UCS2,
1019 PyUnicode_1BYTE_DATA(from) + from_start,
1020 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1021 PyUnicode_2BYTE_DATA(to) + to_start
1022 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001023 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001024 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001025 && to_kind == PyUnicode_4BYTE_KIND)
1026 {
1027 _PyUnicode_CONVERT_BYTES(
1028 Py_UCS1, Py_UCS4,
1029 PyUnicode_1BYTE_DATA(from) + from_start,
1030 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1031 PyUnicode_4BYTE_DATA(to) + to_start
1032 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001033 }
1034 else if (from_kind == PyUnicode_2BYTE_KIND
1035 && to_kind == PyUnicode_4BYTE_KIND)
1036 {
1037 _PyUnicode_CONVERT_BYTES(
1038 Py_UCS2, Py_UCS4,
1039 PyUnicode_2BYTE_DATA(from) + from_start,
1040 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1041 PyUnicode_4BYTE_DATA(to) + to_start
1042 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001043 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001044 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001045 /* check if max_char(from substring) <= max_char(to) */
1046 if (from_kind > to_kind
1047 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001048 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001049 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001050 /* slow path to check for character overflow */
1051 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001052 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001053 Py_ssize_t i;
1054
Victor Stinner56c161a2011-10-06 02:47:11 +02001055#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001056 for (i=0; i < how_many; i++) {
1057 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001058 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001059 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1060 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001061#else
1062 if (!check_maxchar) {
1063 for (i=0; i < how_many; i++) {
1064 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1065 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1066 }
1067 }
1068 else {
1069 for (i=0; i < how_many; i++) {
1070 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1071 if (ch > to_maxchar)
1072 return 1;
1073 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1074 }
1075 }
1076#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001077 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001078 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001079 assert(0 && "inconsistent state");
1080 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001081 }
1082 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001083 return 0;
1084}
1085
1086static void
1087copy_characters(PyObject *to, Py_ssize_t to_start,
1088 PyObject *from, Py_ssize_t from_start,
1089 Py_ssize_t how_many)
1090{
1091 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1092}
1093
1094Py_ssize_t
1095PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1096 PyObject *from, Py_ssize_t from_start,
1097 Py_ssize_t how_many)
1098{
1099 int err;
1100
1101 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1102 PyErr_BadInternalCall();
1103 return -1;
1104 }
1105
1106 if (PyUnicode_READY(from))
1107 return -1;
1108 if (PyUnicode_READY(to))
1109 return -1;
1110
1111 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1112 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1113 PyErr_Format(PyExc_SystemError,
1114 "Cannot write %zi characters at %zi "
1115 "in a string of %zi characters",
1116 how_many, to_start, PyUnicode_GET_LENGTH(to));
1117 return -1;
1118 }
1119
1120 if (how_many == 0)
1121 return 0;
1122
1123 if (_PyUnicode_Dirty(to))
1124 return -1;
1125
1126 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1127 if (err) {
1128 PyErr_Format(PyExc_SystemError,
1129 "Cannot copy %s characters "
1130 "into a string of %s characters",
1131 unicode_kind_name(from),
1132 unicode_kind_name(to));
1133 return -1;
1134 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001135 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001136}
1137
Victor Stinner17222162011-09-28 22:15:37 +02001138/* Find the maximum code point and count the number of surrogate pairs so a
1139 correct string length can be computed before converting a string to UCS4.
1140 This function counts single surrogates as a character and not as a pair.
1141
1142 Return 0 on success, or -1 on error. */
1143static int
1144find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1145 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001146{
1147 const wchar_t *iter;
1148
Victor Stinnerc53be962011-10-02 21:33:54 +02001149 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001150 *num_surrogates = 0;
1151 *maxchar = 0;
1152
1153 for (iter = begin; iter < end; ) {
Victor Stinnerae864852011-10-05 14:02:44 +02001154 if (*iter > *maxchar) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001155 *maxchar = *iter;
Victor Stinnerae864852011-10-05 14:02:44 +02001156#if SIZEOF_WCHAR_T != 2
1157 if (*maxchar >= 0x10000)
1158 return 0;
1159#endif
1160 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001161#if SIZEOF_WCHAR_T == 2
1162 if (*iter >= 0xD800 && *iter <= 0xDBFF
1163 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1164 {
1165 Py_UCS4 surrogate_val;
1166 surrogate_val = (((iter[0] & 0x3FF)<<10)
1167 | (iter[1] & 0x3FF)) + 0x10000;
1168 ++(*num_surrogates);
1169 if (surrogate_val > *maxchar)
1170 *maxchar = surrogate_val;
1171 iter += 2;
1172 }
1173 else
1174 iter++;
1175#else
1176 iter++;
1177#endif
1178 }
1179 return 0;
1180}
1181
1182#ifdef Py_DEBUG
1183int unicode_ready_calls = 0;
1184#endif
1185
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001186static int
1187unicode_ready(PyObject **p_obj, int replace)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001188{
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001189 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001190 wchar_t *end;
1191 Py_UCS4 maxchar = 0;
1192 Py_ssize_t num_surrogates;
1193#if SIZEOF_WCHAR_T == 2
1194 Py_ssize_t length_wo_surrogates;
1195#endif
1196
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001197 assert(p_obj != NULL);
1198 unicode = (PyUnicodeObject *)*p_obj;
1199
Georg Brandl7597add2011-10-05 16:36:47 +02001200 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001201 strings were created using _PyObject_New() and where no canonical
1202 representation (the str field) has been set yet aka strings
1203 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001204 assert(_PyUnicode_CHECK(unicode));
1205 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001206 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001207 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001208 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001209 /* Actually, it should neither be interned nor be anything else: */
1210 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001211
1212#ifdef Py_DEBUG
1213 ++unicode_ready_calls;
1214#endif
1215
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001216#ifdef Py_DEBUG
1217 assert(!replace || Py_REFCNT(unicode) == 1);
1218#else
1219 if (replace && Py_REFCNT(unicode) != 1)
1220 replace = 0;
1221#endif
1222 if (replace) {
1223 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
1224 wchar_t *wstr = _PyUnicode_WSTR(unicode);
1225 /* Optimization for empty strings */
1226 if (len == 0) {
1227 Py_INCREF(unicode_empty);
1228 Py_DECREF(*p_obj);
1229 *p_obj = unicode_empty;
1230 return 0;
1231 }
1232 if (len == 1 && wstr[0] < 256) {
1233 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
1234 if (latin1_char == NULL)
1235 return -1;
1236 Py_DECREF(*p_obj);
1237 *p_obj = latin1_char;
1238 return 0;
1239 }
1240 }
1241
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001242 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001243 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001244 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001245 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001246
1247 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001248 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1249 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001250 PyErr_NoMemory();
1251 return -1;
1252 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001253 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001254 _PyUnicode_WSTR(unicode), end,
1255 PyUnicode_1BYTE_DATA(unicode));
1256 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1257 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1258 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1259 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001260 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001261 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001262 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001263 }
1264 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001265 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001266 _PyUnicode_UTF8(unicode) = NULL;
1267 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001268 }
1269 PyObject_FREE(_PyUnicode_WSTR(unicode));
1270 _PyUnicode_WSTR(unicode) = NULL;
1271 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1272 }
1273 /* In this case we might have to convert down from 4-byte native
1274 wchar_t to 2-byte unicode. */
1275 else if (maxchar < 65536) {
1276 assert(num_surrogates == 0 &&
1277 "FindMaxCharAndNumSurrogatePairs() messed up");
1278
Victor Stinner506f5922011-09-28 22:34:18 +02001279#if SIZEOF_WCHAR_T == 2
1280 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001281 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001282 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1283 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1284 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001285 _PyUnicode_UTF8(unicode) = NULL;
1286 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001287#else
1288 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001289 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001290 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001291 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001292 PyErr_NoMemory();
1293 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001294 }
Victor Stinner506f5922011-09-28 22:34:18 +02001295 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1296 _PyUnicode_WSTR(unicode), end,
1297 PyUnicode_2BYTE_DATA(unicode));
1298 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1299 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1300 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001301 _PyUnicode_UTF8(unicode) = NULL;
1302 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001303 PyObject_FREE(_PyUnicode_WSTR(unicode));
1304 _PyUnicode_WSTR(unicode) = NULL;
1305 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1306#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001307 }
1308 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1309 else {
1310#if SIZEOF_WCHAR_T == 2
1311 /* in case the native representation is 2-bytes, we need to allocate a
1312 new normalized 4-byte version. */
1313 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001314 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1315 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001316 PyErr_NoMemory();
1317 return -1;
1318 }
1319 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1320 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001321 _PyUnicode_UTF8(unicode) = NULL;
1322 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001323 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1324 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001325 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001326 PyObject_FREE(_PyUnicode_WSTR(unicode));
1327 _PyUnicode_WSTR(unicode) = NULL;
1328 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1329#else
1330 assert(num_surrogates == 0);
1331
Victor Stinnerc3c74152011-10-02 20:39:55 +02001332 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001333 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001334 _PyUnicode_UTF8(unicode) = NULL;
1335 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001336 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1337#endif
1338 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1339 }
1340 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001341 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001342 return 0;
1343}
1344
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001345int
1346_PyUnicode_ReadyReplace(PyObject **op)
1347{
1348 return unicode_ready(op, 1);
1349}
1350
1351int
1352_PyUnicode_Ready(PyObject *op)
1353{
1354 return unicode_ready(&op, 0);
1355}
1356
Alexander Belopolsky40018472011-02-26 01:02:56 +00001357static void
1358unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001359{
Walter Dörwald16807132007-05-25 13:52:07 +00001360 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001361 case SSTATE_NOT_INTERNED:
1362 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001363
Benjamin Peterson29060642009-01-31 22:14:21 +00001364 case SSTATE_INTERNED_MORTAL:
1365 /* revive dead object temporarily for DelItem */
1366 Py_REFCNT(unicode) = 3;
1367 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1368 Py_FatalError(
1369 "deletion of interned string failed");
1370 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001371
Benjamin Peterson29060642009-01-31 22:14:21 +00001372 case SSTATE_INTERNED_IMMORTAL:
1373 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001374
Benjamin Peterson29060642009-01-31 22:14:21 +00001375 default:
1376 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001377 }
1378
Victor Stinner03490912011-10-03 23:45:12 +02001379 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001380 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001381 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001382 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001383
1384 if (PyUnicode_IS_COMPACT(unicode)) {
1385 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001386 }
1387 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001388 if (_PyUnicode_DATA_ANY(unicode))
1389 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001390 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001391 }
1392}
1393
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is the empty-string singleton or
   one of the cached one-character strings in unicode_latin1[], else 0. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;
    /* Only strings of length 1 that are not wstr-only can be cached
       characters. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;
    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode);
}
#endif
1410
Alexander Belopolsky40018472011-02-26 01:02:56 +00001411static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001412unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001413{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001414 if (Py_REFCNT(unicode) != 1)
1415 return 0;
1416 if (PyUnicode_CHECK_INTERNED(unicode))
1417 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001418#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001419 /* singleton refcount is greater than 1 */
1420 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001421#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001422 return 1;
1423}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001424
Victor Stinnerfe226c02011-10-03 03:52:20 +02001425static int
1426unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1427{
1428 PyObject *unicode;
1429 Py_ssize_t old_length;
1430
1431 assert(p_unicode != NULL);
1432 unicode = *p_unicode;
1433
1434 assert(unicode != NULL);
1435 assert(PyUnicode_Check(unicode));
1436 assert(0 <= length);
1437
Victor Stinner910337b2011-10-03 03:20:16 +02001438 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001439 old_length = PyUnicode_WSTR_LENGTH(unicode);
1440 else
1441 old_length = PyUnicode_GET_LENGTH(unicode);
1442 if (old_length == length)
1443 return 0;
1444
Victor Stinnerfe226c02011-10-03 03:52:20 +02001445 if (!unicode_resizable(unicode)) {
1446 PyObject *copy = resize_copy(unicode, length);
1447 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001448 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001449 Py_DECREF(*p_unicode);
1450 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001451 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001452 }
1453
Victor Stinnerfe226c02011-10-03 03:52:20 +02001454 if (PyUnicode_IS_COMPACT(unicode)) {
1455 *p_unicode = resize_compact(unicode, length);
1456 if (*p_unicode == NULL)
1457 return -1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001458 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001459 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001460 }
1461 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001462}
1463
Alexander Belopolsky40018472011-02-26 01:02:56 +00001464int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001465PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001466{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001467 PyObject *unicode;
1468 if (p_unicode == NULL) {
1469 PyErr_BadInternalCall();
1470 return -1;
1471 }
1472 unicode = *p_unicode;
1473 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1474 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1475 {
1476 PyErr_BadInternalCall();
1477 return -1;
1478 }
1479 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001480}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001481
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001482static PyObject*
1483get_latin1_char(unsigned char ch)
1484{
Victor Stinnera464fc12011-10-02 20:39:30 +02001485 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001486 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001487 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001488 if (!unicode)
1489 return NULL;
1490 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001491 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001492 unicode_latin1[ch] = unicode;
1493 }
1494 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001495 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001496}
1497
Alexander Belopolsky40018472011-02-26 01:02:56 +00001498PyObject *
1499PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001500{
1501 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001502 Py_UCS4 maxchar = 0;
1503 Py_ssize_t num_surrogates;
1504
1505 if (u == NULL)
1506 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001507
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001508 /* If the Unicode data is known at construction time, we can apply
1509 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001510
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001511 /* Optimization for empty strings */
1512 if (size == 0 && unicode_empty != NULL) {
1513 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001514 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001515 }
Tim Petersced69f82003-09-16 20:30:58 +00001516
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001517 /* Single character Unicode objects in the Latin-1 range are
1518 shared when using this constructor */
1519 if (size == 1 && *u < 256)
1520 return get_latin1_char((unsigned char)*u);
1521
1522 /* If not empty and not single character, copy the Unicode data
1523 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001524 if (find_maxchar_surrogates(u, u + size,
1525 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001526 return NULL;
1527
1528 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1529 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001530 if (!unicode)
1531 return NULL;
1532
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001533 switch (PyUnicode_KIND(unicode)) {
1534 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001535 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001536 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1537 break;
1538 case PyUnicode_2BYTE_KIND:
1539#if Py_UNICODE_SIZE == 2
1540 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1541#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001542 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001543 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1544#endif
1545 break;
1546 case PyUnicode_4BYTE_KIND:
1547#if SIZEOF_WCHAR_T == 2
1548 /* This is the only case which has to process surrogates, thus
1549 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001550 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001551#else
1552 assert(num_surrogates == 0);
1553 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1554#endif
1555 break;
1556 default:
1557 assert(0 && "Impossible state");
1558 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001559
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001560 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001561 return (PyObject *)unicode;
1562}
1563
Alexander Belopolsky40018472011-02-26 01:02:56 +00001564PyObject *
1565PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001566{
1567 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001568
Benjamin Peterson14339b62009-01-31 16:36:08 +00001569 if (size < 0) {
1570 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001571 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001572 return NULL;
1573 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001574
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001575 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001576 some optimizations which share commonly used objects.
1577 Also, this means the input must be UTF-8, so fall back to the
1578 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001579 if (u != NULL) {
1580
Benjamin Peterson29060642009-01-31 22:14:21 +00001581 /* Optimization for empty strings */
1582 if (size == 0 && unicode_empty != NULL) {
1583 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001584 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001585 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001586
1587 /* Single characters are shared when using this constructor.
1588 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001589 if (size == 1 && Py_CHARMASK(*u) < 128)
1590 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001591
1592 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001593 }
1594
Walter Dörwald55507312007-05-18 13:12:10 +00001595 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001596 if (!unicode)
1597 return NULL;
1598
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001599 return (PyObject *)unicode;
1600}
1601
Alexander Belopolsky40018472011-02-26 01:02:56 +00001602PyObject *
1603PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001604{
1605 size_t size = strlen(u);
1606 if (size > PY_SSIZE_T_MAX) {
1607 PyErr_SetString(PyExc_OverflowError, "input too long");
1608 return NULL;
1609 }
1610
1611 return PyUnicode_FromStringAndSize(u, size);
1612}
1613
Victor Stinnere57b1c02011-09-28 22:20:48 +02001614static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001615unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001616{
Victor Stinner0617b6e2011-10-05 23:26:01 +02001617 PyObject *res;
1618#ifdef Py_DEBUG
1619 const unsigned char *p;
1620 const unsigned char *end = s + size;
1621 for (p=s; p < end; p++) {
1622 assert(*p < 128);
1623 }
1624#endif
1625 res = PyUnicode_New(size, 127);
Victor Stinner702c7342011-10-05 13:50:52 +02001626 if (!res)
1627 return NULL;
Victor Stinner0617b6e2011-10-05 23:26:01 +02001628 memcpy(PyUnicode_1BYTE_DATA(res), s, size);
Victor Stinner702c7342011-10-05 13:50:52 +02001629 return res;
1630}
1631
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001632static Py_UCS4
1633kind_maxchar_limit(unsigned int kind)
1634{
1635 switch(kind) {
1636 case PyUnicode_1BYTE_KIND:
1637 return 0x80;
1638 case PyUnicode_2BYTE_KIND:
1639 return 0x100;
1640 case PyUnicode_4BYTE_KIND:
1641 return 0x10000;
1642 default:
1643 assert(0 && "invalid kind");
1644 return 0x10ffff;
1645 }
1646}
1647
Victor Stinner702c7342011-10-05 13:50:52 +02001648static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001649_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001650{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001651 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001652 unsigned char max_char = 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001653 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001654
1655 assert(size >= 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001656 for (i = 0; i < size; i++) {
1657 if (u[i] & 0x80) {
Victor Stinnerb9275c12011-10-05 14:01:42 +02001658 max_char = 255;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001659 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001660 }
1661 }
Victor Stinnerb9275c12011-10-05 14:01:42 +02001662 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001663 if (!res)
1664 return NULL;
1665 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001666 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001667 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001668}
1669
Victor Stinnere57b1c02011-09-28 22:20:48 +02001670static PyObject*
1671_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001672{
1673 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001674 Py_UCS2 max_char = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001675 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001676
1677 assert(size >= 0);
1678 for (i = 0; i < size; i++) {
1679 if (u[i] > max_char) {
1680 max_char = u[i];
1681 if (max_char >= 256)
1682 break;
1683 }
1684 }
1685 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001686 if (!res)
1687 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001688 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001689 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1690 else
1691 for (i = 0; i < size; i++)
1692 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001693 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001694 return res;
1695}
1696
Victor Stinnere57b1c02011-09-28 22:20:48 +02001697static PyObject*
1698_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001699{
1700 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001701 Py_UCS4 max_char = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001702 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001703
1704 assert(size >= 0);
1705 for (i = 0; i < size; i++) {
1706 if (u[i] > max_char) {
1707 max_char = u[i];
1708 if (max_char >= 0x10000)
1709 break;
1710 }
1711 }
1712 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001713 if (!res)
1714 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001715 if (max_char >= 0x10000)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001716 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1717 else {
1718 int kind = PyUnicode_KIND(res);
1719 void *data = PyUnicode_DATA(res);
1720 for (i = 0; i < size; i++)
1721 PyUnicode_WRITE(kind, data, i, u[i]);
1722 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001723 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001724 return res;
1725}
1726
1727PyObject*
1728PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1729{
1730 switch(kind) {
1731 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001732 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001733 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001734 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001735 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001736 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001737 default:
1738 assert(0 && "invalid kind");
1739 PyErr_SetString(PyExc_SystemError, "invalid kind");
1740 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001741 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001742}
1743
Victor Stinner25a4b292011-10-06 12:31:55 +02001744/* Ensure that a string uses the most efficient storage, if it is not the
1745 case: create a new string with of the right kind. Write NULL into *p_unicode
1746 on error. */
1747void
1748unicode_adjust_maxchar(PyObject **p_unicode)
1749{
1750 PyObject *unicode, *copy;
1751 Py_UCS4 max_char;
1752 Py_ssize_t i, len;
1753 unsigned int kind;
1754
1755 assert(p_unicode != NULL);
1756 unicode = *p_unicode;
1757 assert(PyUnicode_IS_READY(unicode));
1758 if (PyUnicode_IS_ASCII(unicode))
1759 return;
1760
1761 len = PyUnicode_GET_LENGTH(unicode);
1762 kind = PyUnicode_KIND(unicode);
1763 if (kind == PyUnicode_1BYTE_KIND) {
1764 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
1765 for (i = 0; i < len; i++) {
1766 if (u[i] & 0x80)
1767 return;
1768 }
1769 max_char = 127;
1770 }
1771 else if (kind == PyUnicode_2BYTE_KIND) {
1772 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
1773 max_char = 0;
1774 for (i = 0; i < len; i++) {
1775 if (u[i] > max_char) {
1776 max_char = u[i];
1777 if (max_char >= 256)
1778 return;
1779 }
1780 }
1781 }
1782 else {
1783 assert(kind == PyUnicode_4BYTE_KIND);
1784 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
1785 max_char = 0;
1786 for (i = 0; i < len; i++) {
1787 if (u[i] > max_char) {
1788 max_char = u[i];
1789 if (max_char >= 0x10000)
1790 return;
1791 }
1792 }
1793 }
1794 assert(max_char > PyUnicode_MAX_CHAR_VALUE(unicode));
1795 copy = PyUnicode_New(len, max_char);
1796 copy_characters(copy, 0, unicode, 0, len);
1797 Py_DECREF(unicode);
1798 *p_unicode = copy;
1799}
1800
Victor Stinner034f6cf2011-09-30 02:26:44 +02001801PyObject*
1802PyUnicode_Copy(PyObject *unicode)
1803{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001804 Py_ssize_t size;
1805 PyObject *copy;
1806 void *data;
1807
Victor Stinner034f6cf2011-09-30 02:26:44 +02001808 if (!PyUnicode_Check(unicode)) {
1809 PyErr_BadInternalCall();
1810 return NULL;
1811 }
1812 if (PyUnicode_READY(unicode))
1813 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001814
1815 size = PyUnicode_GET_LENGTH(unicode);
1816 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1817 if (!copy)
1818 return NULL;
1819 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1820
1821 data = PyUnicode_DATA(unicode);
1822 switch (PyUnicode_KIND(unicode))
1823 {
1824 case PyUnicode_1BYTE_KIND:
1825 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1826 break;
1827 case PyUnicode_2BYTE_KIND:
1828 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1829 break;
1830 case PyUnicode_4BYTE_KIND:
1831 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1832 break;
1833 default:
1834 assert(0);
1835 break;
1836 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001837 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001838 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001839}
1840
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001841
Victor Stinnerbc603d12011-10-02 01:00:40 +02001842/* Widen Unicode objects to larger buffers. Don't write terminating null
1843 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001844
1845void*
1846_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1847{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001848 Py_ssize_t len;
1849 void *result;
1850 unsigned int skind;
1851
1852 if (PyUnicode_READY(s))
1853 return NULL;
1854
1855 len = PyUnicode_GET_LENGTH(s);
1856 skind = PyUnicode_KIND(s);
1857 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001858 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001859 return NULL;
1860 }
1861 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001862 case PyUnicode_2BYTE_KIND:
1863 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1864 if (!result)
1865 return PyErr_NoMemory();
1866 assert(skind == PyUnicode_1BYTE_KIND);
1867 _PyUnicode_CONVERT_BYTES(
1868 Py_UCS1, Py_UCS2,
1869 PyUnicode_1BYTE_DATA(s),
1870 PyUnicode_1BYTE_DATA(s) + len,
1871 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001872 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001873 case PyUnicode_4BYTE_KIND:
1874 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1875 if (!result)
1876 return PyErr_NoMemory();
1877 if (skind == PyUnicode_2BYTE_KIND) {
1878 _PyUnicode_CONVERT_BYTES(
1879 Py_UCS2, Py_UCS4,
1880 PyUnicode_2BYTE_DATA(s),
1881 PyUnicode_2BYTE_DATA(s) + len,
1882 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001883 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001884 else {
1885 assert(skind == PyUnicode_1BYTE_KIND);
1886 _PyUnicode_CONVERT_BYTES(
1887 Py_UCS1, Py_UCS4,
1888 PyUnicode_1BYTE_DATA(s),
1889 PyUnicode_1BYTE_DATA(s) + len,
1890 result);
1891 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001892 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001893 default:
1894 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001895 }
Victor Stinner01698042011-10-04 00:04:26 +02001896 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001897 return NULL;
1898}
1899
1900static Py_UCS4*
1901as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1902 int copy_null)
1903{
1904 int kind;
1905 void *data;
1906 Py_ssize_t len, targetlen;
1907 if (PyUnicode_READY(string) == -1)
1908 return NULL;
1909 kind = PyUnicode_KIND(string);
1910 data = PyUnicode_DATA(string);
1911 len = PyUnicode_GET_LENGTH(string);
1912 targetlen = len;
1913 if (copy_null)
1914 targetlen++;
1915 if (!target) {
1916 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1917 PyErr_NoMemory();
1918 return NULL;
1919 }
1920 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1921 if (!target) {
1922 PyErr_NoMemory();
1923 return NULL;
1924 }
1925 }
1926 else {
1927 if (targetsize < targetlen) {
1928 PyErr_Format(PyExc_SystemError,
1929 "string is longer than the buffer");
1930 if (copy_null && 0 < targetsize)
1931 target[0] = 0;
1932 return NULL;
1933 }
1934 }
1935 if (kind != PyUnicode_4BYTE_KIND) {
1936 Py_ssize_t i;
1937 for (i = 0; i < len; i++)
1938 target[i] = PyUnicode_READ(kind, data, i);
1939 }
1940 else
1941 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1942 if (copy_null)
1943 target[len] = 0;
1944 return target;
1945}
1946
1947Py_UCS4*
1948PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1949 int copy_null)
1950{
1951 if (target == NULL || targetsize < 1) {
1952 PyErr_BadInternalCall();
1953 return NULL;
1954 }
1955 return as_ucs4(string, target, targetsize, copy_null);
1956}
1957
1958Py_UCS4*
1959PyUnicode_AsUCS4Copy(PyObject *string)
1960{
1961 return as_ucs4(string, NULL, 0, 1);
1962}
1963
#ifdef HAVE_WCHAR_H

/* Create a unicode object from the wide character buffer `w`.

   A size of -1 means "measure the buffer with wcslen()".  When w is NULL
   the call is valid only for size 0, in which case PyUnicode_New(0, 0) is
   returned; any other size is a bad internal call.  The conversion itself
   is done by PyUnicode_FromUnicode(). */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        if (size == -1)
            size = wcslen(w);
        return PyUnicode_FromUnicode(w, size);
    }

    if (size != 0) {
        PyErr_BadInternalCall();
        return NULL;
    }
    return PyUnicode_New(0, 0);
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001984
Walter Dörwald346737f2007-05-31 10:44:43 +00001985static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001986makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1987 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001988{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001989 *fmt++ = '%';
1990 if (width) {
1991 if (zeropad)
1992 *fmt++ = '0';
1993 fmt += sprintf(fmt, "%d", width);
1994 }
1995 if (precision)
1996 fmt += sprintf(fmt, ".%d", precision);
1997 if (longflag)
1998 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001999 else if (longlongflag) {
2000 /* longlongflag should only ever be nonzero on machines with
2001 HAVE_LONG_LONG defined */
2002#ifdef HAVE_LONG_LONG
2003 char *f = PY_FORMAT_LONG_LONG;
2004 while (*f)
2005 *fmt++ = *f++;
2006#else
2007 /* we shouldn't ever get here */
2008 assert(0);
2009 *fmt++ = 'l';
2010#endif
2011 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002012 else if (size_tflag) {
2013 char *f = PY_FORMAT_SIZE_T;
2014 while (*f)
2015 *fmt++ = *f++;
2016 }
2017 *fmt++ = c;
2018 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002019}
2020
Victor Stinner96865452011-03-01 23:44:09 +00002021/* helper for PyUnicode_FromFormatV() */
2022
2023static const char*
2024parse_format_flags(const char *f,
2025 int *p_width, int *p_precision,
2026 int *p_longflag, int *p_longlongflag, int *p_size_tflag)
2027{
2028 int width, precision, longflag, longlongflag, size_tflag;
2029
2030 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2031 f++;
2032 width = 0;
2033 while (Py_ISDIGIT((unsigned)*f))
2034 width = (width*10) + *f++ - '0';
2035 precision = 0;
2036 if (*f == '.') {
2037 f++;
2038 while (Py_ISDIGIT((unsigned)*f))
2039 precision = (precision*10) + *f++ - '0';
2040 if (*f == '%') {
2041 /* "%.3%s" => f points to "3" */
2042 f--;
2043 }
2044 }
2045 if (*f == '\0') {
2046 /* bogus format "%.1" => go backward, f points to "1" */
2047 f--;
2048 }
2049 if (p_width != NULL)
2050 *p_width = width;
2051 if (p_precision != NULL)
2052 *p_precision = precision;
2053
2054 /* Handle %ld, %lu, %lld and %llu. */
2055 longflag = 0;
2056 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002057 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002058
2059 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002060 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002061 longflag = 1;
2062 ++f;
2063 }
2064#ifdef HAVE_LONG_LONG
2065 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002066 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002067 longlongflag = 1;
2068 f += 2;
2069 }
2070#endif
2071 }
2072 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002073 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002074 size_tflag = 1;
2075 ++f;
2076 }
2077 if (p_longflag != NULL)
2078 *p_longflag = longflag;
2079 if (p_longlongflag != NULL)
2080 *p_longlongflag = longlongflag;
2081 if (p_size_tflag != NULL)
2082 *p_size_tflag = size_tflag;
2083 return f;
2084}
2085
/* Upper bound on the number of characters produced by a %ld conversion:
   21 characters are enough for a 64-bit integer printed in decimal
   together with an optional sign. */
#define MAX_LONG_CHARS 21
/* Upper bound on the number of characters produced by a %lld conversion.
   At most ceil(log10(256)*SIZEOF_LONG_LONG) digits are needed, plus 1 for
   the sign; 53/22 serves as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2093
Walter Dörwaldd2034312007-05-18 16:29:38 +00002094PyObject *
2095PyUnicode_FromFormatV(const char *format, va_list vargs)
2096{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002097 va_list count;
2098 Py_ssize_t callcount = 0;
2099 PyObject **callresults = NULL;
2100 PyObject **callresult = NULL;
2101 Py_ssize_t n = 0;
2102 int width = 0;
2103 int precision = 0;
2104 int zeropad;
2105 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002106 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002107 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002108 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002109 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2110 Py_UCS4 argmaxchar;
2111 Py_ssize_t numbersize = 0;
2112 char *numberresults = NULL;
2113 char *numberresult = NULL;
2114 Py_ssize_t i;
2115 int kind;
2116 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002117
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002118 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002119 /* step 1: count the number of %S/%R/%A/%s format specifications
2120 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2121 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002122 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002123 * also estimate a upper bound for all the number formats in the string,
2124 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002125 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002126 for (f = format; *f; f++) {
2127 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002128 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002129 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2130 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2131 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2132 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002133
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002134 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002135#ifdef HAVE_LONG_LONG
2136 if (longlongflag) {
2137 if (width < MAX_LONG_LONG_CHARS)
2138 width = MAX_LONG_LONG_CHARS;
2139 }
2140 else
2141#endif
2142 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2143 including sign. Decimal takes the most space. This
2144 isn't enough for octal. If a width is specified we
2145 need more (which we allocate later). */
2146 if (width < MAX_LONG_CHARS)
2147 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002148
2149 /* account for the size + '\0' to separate numbers
2150 inside of the numberresults buffer */
2151 numbersize += (width + 1);
2152 }
2153 }
2154 else if ((unsigned char)*f > 127) {
2155 PyErr_Format(PyExc_ValueError,
2156 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2157 "string, got a non-ASCII byte: 0x%02x",
2158 (unsigned char)*f);
2159 return NULL;
2160 }
2161 }
2162 /* step 2: allocate memory for the results of
2163 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2164 if (callcount) {
2165 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2166 if (!callresults) {
2167 PyErr_NoMemory();
2168 return NULL;
2169 }
2170 callresult = callresults;
2171 }
2172 /* step 2.5: allocate memory for the results of formating numbers */
2173 if (numbersize) {
2174 numberresults = PyObject_Malloc(numbersize);
2175 if (!numberresults) {
2176 PyErr_NoMemory();
2177 goto fail;
2178 }
2179 numberresult = numberresults;
2180 }
2181
2182 /* step 3: format numbers and figure out how large a buffer we need */
2183 for (f = format; *f; f++) {
2184 if (*f == '%') {
2185 const char* p;
2186 int longflag;
2187 int longlongflag;
2188 int size_tflag;
2189 int numprinted;
2190
2191 p = f;
2192 zeropad = (f[1] == '0');
2193 f = parse_format_flags(f, &width, &precision,
2194 &longflag, &longlongflag, &size_tflag);
2195 switch (*f) {
2196 case 'c':
2197 {
2198 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002199 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002200 n++;
2201 break;
2202 }
2203 case '%':
2204 n++;
2205 break;
2206 case 'i':
2207 case 'd':
2208 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2209 width, precision, *f);
2210 if (longflag)
2211 numprinted = sprintf(numberresult, fmt,
2212 va_arg(count, long));
2213#ifdef HAVE_LONG_LONG
2214 else if (longlongflag)
2215 numprinted = sprintf(numberresult, fmt,
2216 va_arg(count, PY_LONG_LONG));
2217#endif
2218 else if (size_tflag)
2219 numprinted = sprintf(numberresult, fmt,
2220 va_arg(count, Py_ssize_t));
2221 else
2222 numprinted = sprintf(numberresult, fmt,
2223 va_arg(count, int));
2224 n += numprinted;
2225 /* advance by +1 to skip over the '\0' */
2226 numberresult += (numprinted + 1);
2227 assert(*(numberresult - 1) == '\0');
2228 assert(*(numberresult - 2) != '\0');
2229 assert(numprinted >= 0);
2230 assert(numberresult <= numberresults + numbersize);
2231 break;
2232 case 'u':
2233 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2234 width, precision, 'u');
2235 if (longflag)
2236 numprinted = sprintf(numberresult, fmt,
2237 va_arg(count, unsigned long));
2238#ifdef HAVE_LONG_LONG
2239 else if (longlongflag)
2240 numprinted = sprintf(numberresult, fmt,
2241 va_arg(count, unsigned PY_LONG_LONG));
2242#endif
2243 else if (size_tflag)
2244 numprinted = sprintf(numberresult, fmt,
2245 va_arg(count, size_t));
2246 else
2247 numprinted = sprintf(numberresult, fmt,
2248 va_arg(count, unsigned int));
2249 n += numprinted;
2250 numberresult += (numprinted + 1);
2251 assert(*(numberresult - 1) == '\0');
2252 assert(*(numberresult - 2) != '\0');
2253 assert(numprinted >= 0);
2254 assert(numberresult <= numberresults + numbersize);
2255 break;
2256 case 'x':
2257 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2258 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2259 n += numprinted;
2260 numberresult += (numprinted + 1);
2261 assert(*(numberresult - 1) == '\0');
2262 assert(*(numberresult - 2) != '\0');
2263 assert(numprinted >= 0);
2264 assert(numberresult <= numberresults + numbersize);
2265 break;
2266 case 'p':
2267 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2268 /* %p is ill-defined: ensure leading 0x. */
2269 if (numberresult[1] == 'X')
2270 numberresult[1] = 'x';
2271 else if (numberresult[1] != 'x') {
2272 memmove(numberresult + 2, numberresult,
2273 strlen(numberresult) + 1);
2274 numberresult[0] = '0';
2275 numberresult[1] = 'x';
2276 numprinted += 2;
2277 }
2278 n += numprinted;
2279 numberresult += (numprinted + 1);
2280 assert(*(numberresult - 1) == '\0');
2281 assert(*(numberresult - 2) != '\0');
2282 assert(numprinted >= 0);
2283 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002284 break;
2285 case 's':
2286 {
2287 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002288 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002289 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2290 if (!str)
2291 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002292 /* since PyUnicode_DecodeUTF8 returns already flexible
2293 unicode objects, there is no need to call ready on them */
2294 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002295 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002296 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002297 /* Remember the str and switch to the next slot */
2298 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002299 break;
2300 }
2301 case 'U':
2302 {
2303 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002304 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002305 if (PyUnicode_READY(obj) == -1)
2306 goto fail;
2307 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002308 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002309 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002310 break;
2311 }
2312 case 'V':
2313 {
2314 PyObject *obj = va_arg(count, PyObject *);
2315 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002316 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002317 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002318 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002319 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002320 if (PyUnicode_READY(obj) == -1)
2321 goto fail;
2322 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002323 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002324 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002325 *callresult++ = NULL;
2326 }
2327 else {
2328 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2329 if (!str_obj)
2330 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002331 if (PyUnicode_READY(str_obj)) {
2332 Py_DECREF(str_obj);
2333 goto fail;
2334 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002335 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002336 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002337 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002338 *callresult++ = str_obj;
2339 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002340 break;
2341 }
2342 case 'S':
2343 {
2344 PyObject *obj = va_arg(count, PyObject *);
2345 PyObject *str;
2346 assert(obj);
2347 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002348 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002349 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002350 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002351 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002352 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002353 /* Remember the str and switch to the next slot */
2354 *callresult++ = str;
2355 break;
2356 }
2357 case 'R':
2358 {
2359 PyObject *obj = va_arg(count, PyObject *);
2360 PyObject *repr;
2361 assert(obj);
2362 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002363 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002364 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002365 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002366 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002367 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002368 /* Remember the repr and switch to the next slot */
2369 *callresult++ = repr;
2370 break;
2371 }
2372 case 'A':
2373 {
2374 PyObject *obj = va_arg(count, PyObject *);
2375 PyObject *ascii;
2376 assert(obj);
2377 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002378 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002379 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002380 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002381 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002382 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002383 /* Remember the repr and switch to the next slot */
2384 *callresult++ = ascii;
2385 break;
2386 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002387 default:
2388 /* if we stumble upon an unknown
2389 formatting code, copy the rest of
2390 the format string to the output
2391 string. (we cannot just skip the
2392 code, since there's no way to know
2393 what's in the argument list) */
2394 n += strlen(p);
2395 goto expand;
2396 }
2397 } else
2398 n++;
2399 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002400 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002401 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002402 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002403 we don't have to resize the string.
2404 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002405 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002406 if (!string)
2407 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002408 kind = PyUnicode_KIND(string);
2409 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002410 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002411 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002412
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002413 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002414 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002415 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002416
2417 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002418 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2419 /* checking for == because the last argument could be a empty
2420 string, which causes i to point to end, the assert at the end of
2421 the loop */
2422 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002423
Benjamin Peterson14339b62009-01-31 16:36:08 +00002424 switch (*f) {
2425 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002426 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002427 const int ordinal = va_arg(vargs, int);
2428 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002429 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002430 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002431 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002432 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002433 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002434 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002435 case 'p':
2436 /* unused, since we already have the result */
2437 if (*f == 'p')
2438 (void) va_arg(vargs, void *);
2439 else
2440 (void) va_arg(vargs, int);
2441 /* extract the result from numberresults and append. */
2442 for (; *numberresult; ++i, ++numberresult)
2443 PyUnicode_WRITE(kind, data, i, *numberresult);
2444 /* skip over the separating '\0' */
2445 assert(*numberresult == '\0');
2446 numberresult++;
2447 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002448 break;
2449 case 's':
2450 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002451 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002452 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002453 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002454 size = PyUnicode_GET_LENGTH(*callresult);
2455 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002456 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002457 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002458 /* We're done with the unicode()/repr() => forget it */
2459 Py_DECREF(*callresult);
2460 /* switch to next unicode()/repr() result */
2461 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002462 break;
2463 }
2464 case 'U':
2465 {
2466 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002467 Py_ssize_t size;
2468 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2469 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002470 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002471 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002472 break;
2473 }
2474 case 'V':
2475 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002476 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002477 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002478 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002479 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002480 size = PyUnicode_GET_LENGTH(obj);
2481 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002482 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002483 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002484 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002485 size = PyUnicode_GET_LENGTH(*callresult);
2486 assert(PyUnicode_KIND(*callresult) <=
2487 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002488 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002489 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002490 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002491 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002492 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002493 break;
2494 }
2495 case 'S':
2496 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002497 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002498 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002499 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002500 /* unused, since we already have the result */
2501 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002502 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002503 copy_characters(string, i, *callresult, 0, size);
2504 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002505 /* We're done with the unicode()/repr() => forget it */
2506 Py_DECREF(*callresult);
2507 /* switch to next unicode()/repr() result */
2508 ++callresult;
2509 break;
2510 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002511 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002512 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002513 break;
2514 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002515 for (; *p; ++p, ++i)
2516 PyUnicode_WRITE(kind, data, i, *p);
2517 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002518 goto end;
2519 }
Victor Stinner1205f272010-09-11 00:54:47 +00002520 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002521 else {
2522 assert(i < PyUnicode_GET_LENGTH(string));
2523 PyUnicode_WRITE(kind, data, i++, *f);
2524 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002525 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002526 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002527
Benjamin Peterson29060642009-01-31 22:14:21 +00002528 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002529 if (callresults)
2530 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002531 if (numberresults)
2532 PyObject_Free(numberresults);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002533 assert(_PyUnicode_CheckConsistency(string, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002534 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002535 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002536 if (callresults) {
2537 PyObject **callresult2 = callresults;
2538 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002539 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002540 ++callresult2;
2541 }
2542 PyObject_Free(callresults);
2543 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002544 if (numberresults)
2545 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002546 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002547}
2548
Walter Dörwaldd2034312007-05-18 16:29:38 +00002549PyObject *
2550PyUnicode_FromFormat(const char *format, ...)
2551{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002552 PyObject* ret;
2553 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002554
2555#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002556 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002557#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002558 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002559#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002560 ret = PyUnicode_FromFormatV(format, vargs);
2561 va_end(vargs);
2562 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002563}
2564
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002565#ifdef HAVE_WCHAR_H
2566
Victor Stinner5593d8a2010-10-02 11:11:27 +00002567/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2568 convert a Unicode object to a wide character string.
2569
Victor Stinnerd88d9832011-09-06 02:00:05 +02002570 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002571 character) required to convert the unicode object. Ignore size argument.
2572
Victor Stinnerd88d9832011-09-06 02:00:05 +02002573 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002574 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002575 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002576static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002577unicode_aswidechar(PyUnicodeObject *unicode,
2578 wchar_t *w,
2579 Py_ssize_t size)
2580{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002581 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002582 const wchar_t *wstr;
2583
2584 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2585 if (wstr == NULL)
2586 return -1;
2587
Victor Stinner5593d8a2010-10-02 11:11:27 +00002588 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002589 if (size > res)
2590 size = res + 1;
2591 else
2592 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002593 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002594 return res;
2595 }
2596 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002597 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002598}
2599
2600Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002601PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002602 wchar_t *w,
2603 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002604{
2605 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002606 PyErr_BadInternalCall();
2607 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002608 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002609 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002610}
2611
Victor Stinner137c34c2010-09-29 10:25:54 +00002612wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002613PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002614 Py_ssize_t *size)
2615{
2616 wchar_t* buffer;
2617 Py_ssize_t buflen;
2618
2619 if (unicode == NULL) {
2620 PyErr_BadInternalCall();
2621 return NULL;
2622 }
2623
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002624 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002625 if (buflen == -1)
2626 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002627 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002628 PyErr_NoMemory();
2629 return NULL;
2630 }
2631
Victor Stinner137c34c2010-09-29 10:25:54 +00002632 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2633 if (buffer == NULL) {
2634 PyErr_NoMemory();
2635 return NULL;
2636 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002637 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002638 if (buflen == -1)
2639 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002640 if (size != NULL)
2641 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002642 return buffer;
2643}
2644
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002645#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002646
Alexander Belopolsky40018472011-02-26 01:02:56 +00002647PyObject *
2648PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002649{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002650 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002651 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002652 PyErr_SetString(PyExc_ValueError,
2653 "chr() arg not in range(0x110000)");
2654 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002655 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002656
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002657 if (ordinal < 256)
2658 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002659
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002660 v = PyUnicode_New(1, ordinal);
2661 if (v == NULL)
2662 return NULL;
2663 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002664 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002665 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002666}
2667
Alexander Belopolsky40018472011-02-26 01:02:56 +00002668PyObject *
2669PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002670{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002671 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002672 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002673 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002674 if (PyUnicode_READY(obj))
2675 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002676 Py_INCREF(obj);
2677 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002678 }
2679 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002680 /* For a Unicode subtype that's not a Unicode object,
2681 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002682 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002683 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002684 PyErr_Format(PyExc_TypeError,
2685 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002686 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002687 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002688}
2689
Alexander Belopolsky40018472011-02-26 01:02:56 +00002690PyObject *
2691PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002692 const char *encoding,
2693 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002694{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002695 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002696 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002697
Guido van Rossumd57fd912000-03-10 22:53:23 +00002698 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002699 PyErr_BadInternalCall();
2700 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002701 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002702
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002703 /* Decoding bytes objects is the most common case and should be fast */
2704 if (PyBytes_Check(obj)) {
2705 if (PyBytes_GET_SIZE(obj) == 0) {
2706 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002707 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002708 }
2709 else {
2710 v = PyUnicode_Decode(
2711 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2712 encoding, errors);
2713 }
2714 return v;
2715 }
2716
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002717 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002718 PyErr_SetString(PyExc_TypeError,
2719 "decoding str is not supported");
2720 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002721 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002722
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002723 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2724 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2725 PyErr_Format(PyExc_TypeError,
2726 "coercing to str: need bytes, bytearray "
2727 "or buffer-like object, %.80s found",
2728 Py_TYPE(obj)->tp_name);
2729 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002730 }
Tim Petersced69f82003-09-16 20:30:58 +00002731
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002732 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002733 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002734 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002735 }
Tim Petersced69f82003-09-16 20:30:58 +00002736 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002737 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002738
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002739 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002740 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002741}
2742
/* Write a normalized copy of `encoding` into `lower`: upper case letters are
   lowered and every '_' becomes '-', so that e.g. UTF_8 is recognized.

   `lower_len` is the capacity of `lower`, terminating null included.
   Returns 1 on success (with `lower` null-terminated), 0 if `encoding` is
   longer than lower_len-1 characters; in that case `lower` is left without a
   terminating null. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src = encoding;
    char *dst = lower;
    /* Last slot of the output buffer, reserved for the terminating null. */
    char * const limit = &lower[lower_len - 1];

    for (; *src != '\0'; src++, dst++) {
        if (dst == limit)
            return 0;

        if (Py_ISUPPER(*src))
            *dst = Py_TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
    }
    *dst = '\0';
    return 1;
}
2775
Alexander Belopolsky40018472011-02-26 01:02:56 +00002776PyObject *
2777PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002778 Py_ssize_t size,
2779 const char *encoding,
2780 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002781{
2782 PyObject *buffer = NULL, *unicode;
2783 Py_buffer info;
2784 char lower[11]; /* Enough for any encoding shortcut */
2785
2786 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002787 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002788
2789 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002790 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002791 if ((strcmp(lower, "utf-8") == 0) ||
2792 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002793 return PyUnicode_DecodeUTF8(s, size, errors);
2794 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002795 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002796 (strcmp(lower, "iso-8859-1") == 0))
2797 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002798#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002799 else if (strcmp(lower, "mbcs") == 0)
2800 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002801#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002802 else if (strcmp(lower, "ascii") == 0)
2803 return PyUnicode_DecodeASCII(s, size, errors);
2804 else if (strcmp(lower, "utf-16") == 0)
2805 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2806 else if (strcmp(lower, "utf-32") == 0)
2807 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2808 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002809
2810 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002811 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002812 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002813 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002814 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002815 if (buffer == NULL)
2816 goto onError;
2817 unicode = PyCodec_Decode(buffer, encoding, errors);
2818 if (unicode == NULL)
2819 goto onError;
2820 if (!PyUnicode_Check(unicode)) {
2821 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002822 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002823 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002824 Py_DECREF(unicode);
2825 goto onError;
2826 }
2827 Py_DECREF(buffer);
Victor Stinner17efeed2011-10-04 20:05:46 +02002828#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02002829 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002830 Py_DECREF(unicode);
2831 return NULL;
2832 }
Victor Stinner17efeed2011-10-04 20:05:46 +02002833#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002834 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002835 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002836
Benjamin Peterson29060642009-01-31 22:14:21 +00002837 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002838 Py_XDECREF(buffer);
2839 return NULL;
2840}
2841
Alexander Belopolsky40018472011-02-26 01:02:56 +00002842PyObject *
2843PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002844 const char *encoding,
2845 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002846{
2847 PyObject *v;
2848
2849 if (!PyUnicode_Check(unicode)) {
2850 PyErr_BadArgument();
2851 goto onError;
2852 }
2853
2854 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002855 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002856
2857 /* Decode via the codec registry */
2858 v = PyCodec_Decode(unicode, encoding, errors);
2859 if (v == NULL)
2860 goto onError;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002861 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002862 return v;
2863
Benjamin Peterson29060642009-01-31 22:14:21 +00002864 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002865 return NULL;
2866}
2867
Alexander Belopolsky40018472011-02-26 01:02:56 +00002868PyObject *
2869PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002870 const char *encoding,
2871 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002872{
2873 PyObject *v;
2874
2875 if (!PyUnicode_Check(unicode)) {
2876 PyErr_BadArgument();
2877 goto onError;
2878 }
2879
2880 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002881 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002882
2883 /* Decode via the codec registry */
2884 v = PyCodec_Decode(unicode, encoding, errors);
2885 if (v == NULL)
2886 goto onError;
2887 if (!PyUnicode_Check(v)) {
2888 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002889 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002890 Py_TYPE(v)->tp_name);
2891 Py_DECREF(v);
2892 goto onError;
2893 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002894 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002895 return v;
2896
Benjamin Peterson29060642009-01-31 22:14:21 +00002897 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002898 return NULL;
2899}
2900
Alexander Belopolsky40018472011-02-26 01:02:56 +00002901PyObject *
2902PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002903 Py_ssize_t size,
2904 const char *encoding,
2905 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002906{
2907 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002908
Guido van Rossumd57fd912000-03-10 22:53:23 +00002909 unicode = PyUnicode_FromUnicode(s, size);
2910 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002911 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002912 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2913 Py_DECREF(unicode);
2914 return v;
2915}
2916
Alexander Belopolsky40018472011-02-26 01:02:56 +00002917PyObject *
2918PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002919 const char *encoding,
2920 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002921{
2922 PyObject *v;
2923
2924 if (!PyUnicode_Check(unicode)) {
2925 PyErr_BadArgument();
2926 goto onError;
2927 }
2928
2929 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002930 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002931
2932 /* Encode via the codec registry */
2933 v = PyCodec_Encode(unicode, encoding, errors);
2934 if (v == NULL)
2935 goto onError;
2936 return v;
2937
Benjamin Peterson29060642009-01-31 22:14:21 +00002938 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002939 return NULL;
2940}
2941
Victor Stinnerad158722010-10-27 00:25:46 +00002942PyObject *
2943PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002944{
Victor Stinner99b95382011-07-04 14:23:54 +02002945#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002946 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2947 PyUnicode_GET_SIZE(unicode),
2948 NULL);
2949#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002950 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002951#else
Victor Stinner793b5312011-04-27 00:24:21 +02002952 PyInterpreterState *interp = PyThreadState_GET()->interp;
2953 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2954 cannot use it to encode and decode filenames before it is loaded. Load
2955 the Python codec requires to encode at least its own filename. Use the C
2956 version of the locale codec until the codec registry is initialized and
2957 the Python codec is loaded.
2958
2959 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2960 cannot only rely on it: check also interp->fscodec_initialized for
2961 subinterpreters. */
2962 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002963 return PyUnicode_AsEncodedString(unicode,
2964 Py_FileSystemDefaultEncoding,
2965 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002966 }
2967 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002968 /* locale encoding with surrogateescape */
2969 wchar_t *wchar;
2970 char *bytes;
2971 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002972 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002973
2974 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2975 if (wchar == NULL)
2976 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002977 bytes = _Py_wchar2char(wchar, &error_pos);
2978 if (bytes == NULL) {
2979 if (error_pos != (size_t)-1) {
2980 char *errmsg = strerror(errno);
2981 PyObject *exc = NULL;
2982 if (errmsg == NULL)
2983 errmsg = "Py_wchar2char() failed";
2984 raise_encode_exception(&exc,
2985 "filesystemencoding",
2986 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2987 error_pos, error_pos+1,
2988 errmsg);
2989 Py_XDECREF(exc);
2990 }
2991 else
2992 PyErr_NoMemory();
2993 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002994 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002995 }
2996 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002997
2998 bytes_obj = PyBytes_FromString(bytes);
2999 PyMem_Free(bytes);
3000 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00003001 }
Victor Stinnerad158722010-10-27 00:25:46 +00003002#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003003}
3004
Alexander Belopolsky40018472011-02-26 01:02:56 +00003005PyObject *
3006PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003007 const char *encoding,
3008 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003009{
3010 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003011 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003012
Guido van Rossumd57fd912000-03-10 22:53:23 +00003013 if (!PyUnicode_Check(unicode)) {
3014 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003015 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003016 }
Fred Drakee4315f52000-05-09 19:53:39 +00003017
Victor Stinner2f283c22011-03-02 01:21:46 +00003018 if (encoding == NULL) {
3019 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003020 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003021 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003022 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00003023 }
Fred Drakee4315f52000-05-09 19:53:39 +00003024
3025 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003026 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003027 if ((strcmp(lower, "utf-8") == 0) ||
3028 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003029 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003030 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003031 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003032 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003033 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003034 }
Victor Stinner37296e82010-06-10 13:36:23 +00003035 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003036 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003037 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003038 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003039#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003040 else if (strcmp(lower, "mbcs") == 0)
3041 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3042 PyUnicode_GET_SIZE(unicode),
3043 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003044#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003045 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003046 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003047 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003048
3049 /* Encode via the codec registry */
3050 v = PyCodec_Encode(unicode, encoding, errors);
3051 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003052 return NULL;
3053
3054 /* The normal path */
3055 if (PyBytes_Check(v))
3056 return v;
3057
3058 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003059 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003060 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003061 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003062
3063 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3064 "encoder %s returned bytearray instead of bytes",
3065 encoding);
3066 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003067 Py_DECREF(v);
3068 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003069 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003070
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003071 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3072 Py_DECREF(v);
3073 return b;
3074 }
3075
3076 PyErr_Format(PyExc_TypeError,
3077 "encoder did not return a bytes object (type=%.400s)",
3078 Py_TYPE(v)->tp_name);
3079 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003080 return NULL;
3081}
3082
Alexander Belopolsky40018472011-02-26 01:02:56 +00003083PyObject *
3084PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003085 const char *encoding,
3086 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003087{
3088 PyObject *v;
3089
3090 if (!PyUnicode_Check(unicode)) {
3091 PyErr_BadArgument();
3092 goto onError;
3093 }
3094
3095 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003096 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003097
3098 /* Encode via the codec registry */
3099 v = PyCodec_Encode(unicode, encoding, errors);
3100 if (v == NULL)
3101 goto onError;
3102 if (!PyUnicode_Check(v)) {
3103 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003104 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003105 Py_TYPE(v)->tp_name);
3106 Py_DECREF(v);
3107 goto onError;
3108 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003109 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003110
Benjamin Peterson29060642009-01-31 22:14:21 +00003111 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003112 return NULL;
3113}
3114
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003115PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003116PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003117 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003118 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3119}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003120
Christian Heimes5894ba72007-11-04 11:43:14 +00003121PyObject*
3122PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3123{
Victor Stinner99b95382011-07-04 14:23:54 +02003124#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003125 return PyUnicode_DecodeMBCS(s, size, NULL);
3126#elif defined(__APPLE__)
3127 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
3128#else
Victor Stinner793b5312011-04-27 00:24:21 +02003129 PyInterpreterState *interp = PyThreadState_GET()->interp;
3130 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3131 cannot use it to encode and decode filenames before it is loaded. Load
3132 the Python codec requires to encode at least its own filename. Use the C
3133 version of the locale codec until the codec registry is initialized and
3134 the Python codec is loaded.
3135
3136 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3137 cannot only rely on it: check also interp->fscodec_initialized for
3138 subinterpreters. */
3139 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003140 return PyUnicode_Decode(s, size,
3141 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003142 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003143 }
3144 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003145 /* locale encoding with surrogateescape */
3146 wchar_t *wchar;
3147 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00003148 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003149
3150 if (s[size] != '\0' || size != strlen(s)) {
3151 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3152 return NULL;
3153 }
3154
Victor Stinner168e1172010-10-16 23:16:16 +00003155 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003156 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003157 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003158
Victor Stinner168e1172010-10-16 23:16:16 +00003159 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003160 PyMem_Free(wchar);
3161 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003162 }
Victor Stinnerad158722010-10-27 00:25:46 +00003163#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003164}
3165
Martin v. Löwis011e8422009-05-05 04:43:17 +00003166
3167int
3168PyUnicode_FSConverter(PyObject* arg, void* addr)
3169{
3170 PyObject *output = NULL;
3171 Py_ssize_t size;
3172 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003173 if (arg == NULL) {
3174 Py_DECREF(*(PyObject**)addr);
3175 return 1;
3176 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003177 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003178 output = arg;
3179 Py_INCREF(output);
3180 }
3181 else {
3182 arg = PyUnicode_FromObject(arg);
3183 if (!arg)
3184 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003185 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003186 Py_DECREF(arg);
3187 if (!output)
3188 return 0;
3189 if (!PyBytes_Check(output)) {
3190 Py_DECREF(output);
3191 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3192 return 0;
3193 }
3194 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003195 size = PyBytes_GET_SIZE(output);
3196 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003197 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003198 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003199 Py_DECREF(output);
3200 return 0;
3201 }
3202 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003203 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003204}
3205
3206
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003207int
3208PyUnicode_FSDecoder(PyObject* arg, void* addr)
3209{
3210 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003211 if (arg == NULL) {
3212 Py_DECREF(*(PyObject**)addr);
3213 return 1;
3214 }
3215 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003216 if (PyUnicode_READY(arg))
3217 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003218 output = arg;
3219 Py_INCREF(output);
3220 }
3221 else {
3222 arg = PyBytes_FromObject(arg);
3223 if (!arg)
3224 return 0;
3225 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3226 PyBytes_GET_SIZE(arg));
3227 Py_DECREF(arg);
3228 if (!output)
3229 return 0;
3230 if (!PyUnicode_Check(output)) {
3231 Py_DECREF(output);
3232 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3233 return 0;
3234 }
3235 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003236 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3237 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003238 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3239 Py_DECREF(output);
3240 return 0;
3241 }
3242 *(PyObject**)addr = output;
3243 return Py_CLEANUP_SUPPORTED;
3244}
3245
3246
Martin v. Löwis5b222132007-06-10 09:51:05 +00003247char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003248PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003249{
Christian Heimesf3863112007-11-22 07:46:41 +00003250 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003251 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
3252
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003253 if (!PyUnicode_Check(unicode)) {
3254 PyErr_BadArgument();
3255 return NULL;
3256 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003257 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003258 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003259
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003260 if (PyUnicode_UTF8(unicode) == NULL) {
3261 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003262 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3263 if (bytes == NULL)
3264 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003265 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3266 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003267 Py_DECREF(bytes);
3268 return NULL;
3269 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003270 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
3271 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003272 Py_DECREF(bytes);
3273 }
3274
3275 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003276 *psize = PyUnicode_UTF8_LENGTH(unicode);
3277 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003278}
3279
3280char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003281PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003282{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003283 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3284}
3285
#ifdef Py_DEBUG
/* Debug-build counter: PyUnicode_AsUnicodeAndSize() increments it each
   time it finds no wstr buffer on the object and has to build one. */
int unicode_as_unicode_calls = 0;
#endif
3289
3290
/* Return the wchar_t (Py_UNICODE) buffer stored on the str object,
   building and storing it first if the object has none yet.

   unicode: the str object; anything else raises via PyErr_BadArgument().
   size:    if not NULL, receives PyUnicode_WSTR_LENGTH(u).

   The buffer is stored in the object (_PyUnicode_WSTR) and is returned
   without transferring ownership.  Returns NULL on a bad argument or
   when the buffer cannot be allocated (MemoryError is set). */
Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
{
    PyUnicodeObject *u;
    const unsigned char *one_byte;
    /* Only the source pointers needed for the conversions that can occur
       with this wchar_t width are declared. */
#if SIZEOF_WCHAR_T == 4
    const Py_UCS2 *two_bytes;
#else
    const Py_UCS4 *four_bytes;
    const Py_UCS4 *ucs4_end;
    Py_ssize_t num_surrogates;
#endif
    wchar_t *w;
    wchar_t *wchar_end;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    u = (PyUnicodeObject*)unicode;
    if (_PyUnicode_WSTR(u) == NULL) {
        /* Non-ASCII compact unicode object */
        assert(_PyUnicode_KIND(u) != 0);
        assert(PyUnicode_IS_READY(u));

#ifdef Py_DEBUG
        ++unicode_as_unicode_calls;
#endif

        if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
#if SIZEOF_WCHAR_T == 2
            four_bytes = PyUnicode_4BYTE_DATA(u);
            ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
            num_surrogates = 0;

            /* First pass: count the code points above 0xFFFF; each of
               them takes two wchar_t units in the output. */
            for (; four_bytes < ucs4_end; ++four_bytes) {
                if (*four_bytes > 0xFFFF)
                    ++num_surrogates;
            }

            /* Room for every unit plus the terminating 0 */
            _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
                    sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
            if (!_PyUnicode_WSTR(u)) {
                PyErr_NoMemory();
                return NULL;
            }
            _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;

            /* Second pass: copy, splitting large code points in two */
            w = _PyUnicode_WSTR(u);
            wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
            four_bytes = PyUnicode_4BYTE_DATA(u);
            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
                if (*four_bytes > 0xFFFF) {
                    /* encode surrogate pair in this case */
                    *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
                    *w   = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
                }
                else
                    *w = *four_bytes;

                /* Sanity check; only active when assert() is enabled */
                if (w > wchar_end) {
                    assert(0 && "Miscalculated string end");
                }
            }
            *w = 0;
#else
            /* sizeof(wchar_t) == 4 */
            Py_FatalError("Impossible unicode object state, wstr and str "
                          "should share memory already.");
            return NULL;
#endif
        }
        else {
            /* 1-byte or 2-byte kind: one wchar_t per character, plus the
               terminating 0 */
            _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
                                                  (_PyUnicode_LENGTH(u) + 1));
            if (!_PyUnicode_WSTR(u)) {
                PyErr_NoMemory();
                return NULL;
            }
            /* The wstr length is written for every object except the
               compact ASCII ones */
            if (!PyUnicode_IS_COMPACT_ASCII(u))
                _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
            w = _PyUnicode_WSTR(u);
            wchar_end = w + _PyUnicode_LENGTH(u);

            if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
                one_byte = PyUnicode_1BYTE_DATA(u);
                for (; w < wchar_end; ++one_byte, ++w)
                    *w = *one_byte;
                /* null-terminate the wstr */
                *w = 0;
            }
            else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
#if SIZEOF_WCHAR_T == 4
                two_bytes = PyUnicode_2BYTE_DATA(u);
                for (; w < wchar_end; ++two_bytes, ++w)
                    *w = *two_bytes;
                /* null-terminate the wstr */
                *w = 0;
#else
                /* sizeof(wchar_t) == 2 */
                /* Release the buffer allocated above before aborting */
                PyObject_FREE(_PyUnicode_WSTR(u));
                _PyUnicode_WSTR(u) = NULL;
                Py_FatalError("Impossible unicode object state, wstr "
                              "and str should share memory already.");
                return NULL;
#endif
            }
            else {
                assert(0 && "This should never happen.");
            }
        }
    }
    if (size != NULL)
        *size = PyUnicode_WSTR_LENGTH(u);
    return _PyUnicode_WSTR(u);
}
3407
Alexander Belopolsky40018472011-02-26 01:02:56 +00003408Py_UNICODE *
3409PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003410{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003411 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003412}
3413
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003414
Alexander Belopolsky40018472011-02-26 01:02:56 +00003415Py_ssize_t
3416PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003417{
3418 if (!PyUnicode_Check(unicode)) {
3419 PyErr_BadArgument();
3420 goto onError;
3421 }
3422 return PyUnicode_GET_SIZE(unicode);
3423
Benjamin Peterson29060642009-01-31 22:14:21 +00003424 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003425 return -1;
3426}
3427
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003428Py_ssize_t
3429PyUnicode_GetLength(PyObject *unicode)
3430{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003431 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003432 PyErr_BadArgument();
3433 return -1;
3434 }
3435
3436 return PyUnicode_GET_LENGTH(unicode);
3437}
3438
3439Py_UCS4
3440PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3441{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003442 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3443 PyErr_BadArgument();
3444 return (Py_UCS4)-1;
3445 }
3446 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3447 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003448 return (Py_UCS4)-1;
3449 }
3450 return PyUnicode_READ_CHAR(unicode, index);
3451}
3452
3453int
3454PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3455{
3456 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003457 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003458 return -1;
3459 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003460 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3461 PyErr_SetString(PyExc_IndexError, "string index out of range");
3462 return -1;
3463 }
3464 if (_PyUnicode_Dirty(unicode))
3465 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003466 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3467 index, ch);
3468 return 0;
3469}
3470
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be modified. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3476
Victor Stinner554f3f02010-06-16 23:33:54 +00003477/* create or adjust a UnicodeDecodeError */
3478static void
3479make_decode_exception(PyObject **exceptionObject,
3480 const char *encoding,
3481 const char *input, Py_ssize_t length,
3482 Py_ssize_t startpos, Py_ssize_t endpos,
3483 const char *reason)
3484{
3485 if (*exceptionObject == NULL) {
3486 *exceptionObject = PyUnicodeDecodeError_Create(
3487 encoding, input, length, startpos, endpos, reason);
3488 }
3489 else {
3490 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3491 goto onError;
3492 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3493 goto onError;
3494 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3495 goto onError;
3496 }
3497 return;
3498
3499onError:
3500 Py_DECREF(*exceptionObject);
3501 *exceptionObject = NULL;
3502}
3503
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003504/* error handling callback helper:
3505 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003506 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003507 and adjust various state variables.
3508 return 0 on success, -1 on error
3509*/
3510
Alexander Belopolsky40018472011-02-26 01:02:56 +00003511static int
3512unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003513 const char *encoding, const char *reason,
3514 const char **input, const char **inend, Py_ssize_t *startinpos,
3515 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3516 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003517{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003518 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003519
3520 PyObject *restuple = NULL;
3521 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003522 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003523 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003524 Py_ssize_t requiredsize;
3525 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003526 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003527 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003528 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003529 int res = -1;
3530
3531 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003532 *errorHandler = PyCodec_LookupError(errors);
3533 if (*errorHandler == NULL)
3534 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003535 }
3536
Victor Stinner554f3f02010-06-16 23:33:54 +00003537 make_decode_exception(exceptionObject,
3538 encoding,
3539 *input, *inend - *input,
3540 *startinpos, *endinpos,
3541 reason);
3542 if (*exceptionObject == NULL)
3543 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003544
3545 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3546 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003547 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003548 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003549 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003550 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003551 }
3552 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003553 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003554
3555 /* Copy back the bytes variables, which might have been modified by the
3556 callback */
3557 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3558 if (!inputobj)
3559 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003560 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003561 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003562 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003563 *input = PyBytes_AS_STRING(inputobj);
3564 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003565 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003566 /* we can DECREF safely, as the exception has another reference,
3567 so the object won't go away. */
3568 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003569
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003570 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003571 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003572 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003573 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3574 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003575 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003576
3577 /* need more space? (at least enough for what we
3578 have+the replacement+the rest of the string (starting
3579 at the new input position), so we won't have to check space
3580 when there are no errors in the rest of the string) */
3581 repptr = PyUnicode_AS_UNICODE(repunicode);
3582 repsize = PyUnicode_GET_SIZE(repunicode);
3583 requiredsize = *outpos + repsize + insize-newpos;
3584 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003585 if (requiredsize<2*outsize)
3586 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003587 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003588 goto onError;
3589 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003590 }
3591 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003592 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003593 Py_UNICODE_COPY(*outptr, repptr, repsize);
3594 *outptr += repsize;
3595 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003596
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003597 /* we made it! */
3598 res = 0;
3599
Benjamin Peterson29060642009-01-31 22:14:21 +00003600 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003601 Py_XDECREF(restuple);
3602 return res;
3603}
3604
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003605/* --- UTF-7 Codec -------------------------------------------------------- */
3606
Antoine Pitrou244651a2009-05-04 18:56:13 +00003607/* See RFC2152 for details. We encode conservatively and decode liberally. */
3608
/* Three simple macros defining base-64. */

/* All macros in this group are plain expressions that mention their
   argument several times, so the argument must be free of side effects. */

/* Is c a base-64 character?  True for A-Z, a-z, 0-9, '+' and '/'. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value?
   A-Z map to 0-25, a-z to 26-51, 0-9 to 52-61, '+' to 62; any other
   input (expected to be '/') yields 63, so c is not validated here. */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n?
   Higher bits of n are masked off before the table lookup. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself. We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string.  Note that the test is only (c) <= 127, so a negative c
 * also qualifies -- presumably callers pass an unsigned value; verify
 * against the call sites. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
3639
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by the ASCII code,
 * identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is read-only, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
3673
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        - the character; only 1..127 can ever be direct (NUL and
 *            anything outside ASCII never are)
 * directO  - non-zero if Set O characters (category 1) are written as-is
 * directWS - non-zero if whitespace (category 2) is written as-is
 *
 * Set D characters (category 0) are always direct.  Both flag arguments
 * are parenthesized so that compound expressions can be passed safely. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003685
Alexander Belopolsky40018472011-02-26 01:02:56 +00003686PyObject *
3687PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003688 Py_ssize_t size,
3689 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003690{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003691 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3692}
3693
Antoine Pitrou244651a2009-05-04 18:56:13 +00003694/* The decoder. The only state we preserve is our read position,
3695 * i.e. how many characters we have consumed. So if we end in the
3696 * middle of a shift sequence we have to back off the read position
3697 * and the output to the beginning of the sequence, otherwise we lose
3698 * all the shift state (seen bits, number of bits seen, high
3699 * surrogate). */
3700
Alexander Belopolsky40018472011-02-26 01:02:56 +00003701PyObject *
3702PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003703 Py_ssize_t size,
3704 const char *errors,
3705 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003706{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003707 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003708 Py_ssize_t startinpos;
3709 Py_ssize_t endinpos;
3710 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003711 const char *e;
3712 PyUnicodeObject *unicode;
3713 Py_UNICODE *p;
3714 const char *errmsg = "";
3715 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003716 Py_UNICODE *shiftOutStart;
3717 unsigned int base64bits = 0;
3718 unsigned long base64buffer = 0;
3719 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003720 PyObject *errorHandler = NULL;
3721 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003722
3723 unicode = _PyUnicode_New(size);
3724 if (!unicode)
3725 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003726 if (size == 0) {
3727 if (consumed)
3728 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003729 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003730 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003731
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003732 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003733 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003734 e = s + size;
3735
3736 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003737 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003738 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003739 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003740
Antoine Pitrou244651a2009-05-04 18:56:13 +00003741 if (inShift) { /* in a base-64 section */
3742 if (IS_BASE64(ch)) { /* consume a base-64 character */
3743 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3744 base64bits += 6;
3745 s++;
3746 if (base64bits >= 16) {
3747 /* we have enough bits for a UTF-16 value */
3748 Py_UNICODE outCh = (Py_UNICODE)
3749 (base64buffer >> (base64bits-16));
3750 base64bits -= 16;
3751 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3752 if (surrogate) {
3753 /* expecting a second surrogate */
3754 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3755#ifdef Py_UNICODE_WIDE
3756 *p++ = (((surrogate & 0x3FF)<<10)
3757 | (outCh & 0x3FF)) + 0x10000;
3758#else
3759 *p++ = surrogate;
3760 *p++ = outCh;
3761#endif
3762 surrogate = 0;
3763 }
3764 else {
3765 surrogate = 0;
3766 errmsg = "second surrogate missing";
3767 goto utf7Error;
3768 }
3769 }
3770 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3771 /* first surrogate */
3772 surrogate = outCh;
3773 }
3774 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3775 errmsg = "unexpected second surrogate";
3776 goto utf7Error;
3777 }
3778 else {
3779 *p++ = outCh;
3780 }
3781 }
3782 }
3783 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003784 inShift = 0;
3785 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003786 if (surrogate) {
3787 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003788 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003789 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003790 if (base64bits > 0) { /* left-over bits */
3791 if (base64bits >= 6) {
3792 /* We've seen at least one base-64 character */
3793 errmsg = "partial character in shift sequence";
3794 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003795 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003796 else {
3797 /* Some bits remain; they should be zero */
3798 if (base64buffer != 0) {
3799 errmsg = "non-zero padding bits in shift sequence";
3800 goto utf7Error;
3801 }
3802 }
3803 }
3804 if (ch != '-') {
3805 /* '-' is absorbed; other terminating
3806 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003807 *p++ = ch;
3808 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003809 }
3810 }
3811 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003812 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003813 s++; /* consume '+' */
3814 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003815 s++;
3816 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003817 }
3818 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003819 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003820 shiftOutStart = p;
3821 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003822 }
3823 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003824 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003825 *p++ = ch;
3826 s++;
3827 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003828 else {
3829 startinpos = s-starts;
3830 s++;
3831 errmsg = "unexpected special character";
3832 goto utf7Error;
3833 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003834 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003835utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003836 outpos = p-PyUnicode_AS_UNICODE(unicode);
3837 endinpos = s-starts;
3838 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003839 errors, &errorHandler,
3840 "utf7", errmsg,
3841 &starts, &e, &startinpos, &endinpos, &exc, &s,
3842 &unicode, &outpos, &p))
3843 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003844 }
3845
Antoine Pitrou244651a2009-05-04 18:56:13 +00003846 /* end of string */
3847
3848 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3849 /* if we're in an inconsistent state, that's an error */
3850 if (surrogate ||
3851 (base64bits >= 6) ||
3852 (base64bits > 0 && base64buffer != 0)) {
3853 outpos = p-PyUnicode_AS_UNICODE(unicode);
3854 endinpos = size;
3855 if (unicode_decode_call_errorhandler(
3856 errors, &errorHandler,
3857 "utf7", "unterminated shift sequence",
3858 &starts, &e, &startinpos, &endinpos, &exc, &s,
3859 &unicode, &outpos, &p))
3860 goto onError;
3861 if (s < e)
3862 goto restart;
3863 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003864 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003865
3866 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003867 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003868 if (inShift) {
3869 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003870 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003871 }
3872 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003873 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003874 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003875 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003876
Victor Stinnerfe226c02011-10-03 03:52:20 +02003877 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003878 goto onError;
3879
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003880 Py_XDECREF(errorHandler);
3881 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02003882#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02003883 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003884 Py_DECREF(unicode);
3885 return NULL;
3886 }
Victor Stinner17efeed2011-10-04 20:05:46 +02003887#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02003888 assert(_PyUnicode_CheckConsistency(unicode, 1));
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003889 return (PyObject *)unicode;
3890
Benjamin Peterson29060642009-01-31 22:14:21 +00003891 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003892 Py_XDECREF(errorHandler);
3893 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003894 Py_DECREF(unicode);
3895 return NULL;
3896}
3897
3898
Alexander Belopolsky40018472011-02-26 01:02:56 +00003899PyObject *
3900PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003901 Py_ssize_t size,
3902 int base64SetO,
3903 int base64WhiteSpace,
3904 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003905{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003906 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003907 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003908 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003909 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003910 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003911 unsigned int base64bits = 0;
3912 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003913 char * out;
3914 char * start;
3915
3916 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003917 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003918
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003919 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003920 return PyErr_NoMemory();
3921
Antoine Pitrou244651a2009-05-04 18:56:13 +00003922 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003923 if (v == NULL)
3924 return NULL;
3925
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003926 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003927 for (;i < size; ++i) {
3928 Py_UNICODE ch = s[i];
3929
Antoine Pitrou244651a2009-05-04 18:56:13 +00003930 if (inShift) {
3931 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3932 /* shifting out */
3933 if (base64bits) { /* output remaining bits */
3934 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3935 base64buffer = 0;
3936 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003937 }
3938 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003939 /* Characters not in the BASE64 set implicitly unshift the sequence
3940 so no '-' is required, except if the character is itself a '-' */
3941 if (IS_BASE64(ch) || ch == '-') {
3942 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003943 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003944 *out++ = (char) ch;
3945 }
3946 else {
3947 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003948 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003949 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003950 else { /* not in a shift sequence */
3951 if (ch == '+') {
3952 *out++ = '+';
3953 *out++ = '-';
3954 }
3955 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3956 *out++ = (char) ch;
3957 }
3958 else {
3959 *out++ = '+';
3960 inShift = 1;
3961 goto encode_char;
3962 }
3963 }
3964 continue;
3965encode_char:
3966#ifdef Py_UNICODE_WIDE
3967 if (ch >= 0x10000) {
3968 /* code first surrogate */
3969 base64bits += 16;
3970 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3971 while (base64bits >= 6) {
3972 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3973 base64bits -= 6;
3974 }
3975 /* prepare second surrogate */
3976 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3977 }
3978#endif
3979 base64bits += 16;
3980 base64buffer = (base64buffer << 16) | ch;
3981 while (base64bits >= 6) {
3982 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3983 base64bits -= 6;
3984 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003985 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003986 if (base64bits)
3987 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3988 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003989 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003990 if (_PyBytes_Resize(&v, out - start) < 0)
3991 return NULL;
3992 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003993}
3994
Antoine Pitrou244651a2009-05-04 18:56:13 +00003995#undef IS_BASE64
3996#undef FROM_BASE64
3997#undef TO_BASE64
3998#undef DECODE_DIRECT
3999#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004000
Guido van Rossumd57fd912000-03-10 22:53:23 +00004001/* --- UTF-8 Codec -------------------------------------------------------- */
4002
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details.  The table is
       read-only, hence const. */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4024
Alexander Belopolsky40018472011-02-26 01:02:56 +00004025PyObject *
4026PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004027 Py_ssize_t size,
4028 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004029{
Walter Dörwald69652032004-09-07 20:24:22 +00004030 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4031}
4032
/* Mask to check or force alignment of a pointer to C 'long' boundaries.
   The expansion is fully parenthesized so that it behaves as a single
   operand wherever it is used. */
#define LONG_PTR_MASK ((size_t) (SIZEOF_LONG - 1))
4035
4036/* Mask to quickly check whether a C 'long' contains a
4037 non-ASCII, UTF8-encoded char. */
4038#if (SIZEOF_LONG == 8)
4039# define ASCII_CHAR_MASK 0x8080808080808080L
4040#elif (SIZEOF_LONG == 4)
4041# define ASCII_CHAR_MASK 0x80808080L
4042#else
4043# error C 'long' size should be either 4 or 8!
4044#endif
4045
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004046/* Scans a UTF-8 string and returns the maximum character to be expected,
4047 the size of the decoded unicode string and if any major errors were
4048 encountered.
4049
4050 This function does check basic UTF-8 sanity, it does however NOT CHECK
4051 if the string contains surrogates, and if all continuation bytes are
4052 within the correct ranges, these checks are performed in
4053 PyUnicode_DecodeUTF8Stateful.
4054
4055 If it sets has_errors to 1, it means the value of unicode_size and max_char
4056 will be bogus and you should not rely on useful information in them.
4057 */
4058static Py_UCS4
4059utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
4060 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
4061 int *has_errors)
4062{
4063 Py_ssize_t n;
4064 Py_ssize_t char_count = 0;
4065 Py_UCS4 max_char = 127, new_max;
4066 Py_UCS4 upper_bound;
4067 const unsigned char *p = (const unsigned char *)s;
4068 const unsigned char *end = p + string_size;
4069 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
4070 int err = 0;
4071
4072 for (; p < end && !err; ++p, ++char_count) {
4073 /* Only check value if it's not a ASCII char... */
4074 if (*p < 0x80) {
4075 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4076 an explanation. */
4077 if (!((size_t) p & LONG_PTR_MASK)) {
4078 /* Help register allocation */
4079 register const unsigned char *_p = p;
4080 while (_p < aligned_end) {
4081 unsigned long value = *(unsigned long *) _p;
4082 if (value & ASCII_CHAR_MASK)
4083 break;
4084 _p += SIZEOF_LONG;
4085 char_count += SIZEOF_LONG;
4086 }
4087 p = _p;
4088 if (p == end)
4089 break;
4090 }
4091 }
4092 if (*p >= 0x80) {
4093 n = utf8_code_length[*p];
4094 new_max = max_char;
4095 switch (n) {
4096 /* invalid start byte */
4097 case 0:
4098 err = 1;
4099 break;
4100 case 2:
4101 /* Code points between 0x00FF and 0x07FF inclusive.
4102 Approximate the upper bound of the code point,
4103 if this flips over 255 we can be sure it will be more
4104 than 255 and the string will need 2 bytes per code coint,
4105 if it stays under or equal to 255, we can be sure 1 byte
4106 is enough.
4107 ((*p & 0b00011111) << 6) | 0b00111111 */
4108 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
4109 if (max_char < upper_bound)
4110 new_max = upper_bound;
4111 /* Ensure we track at least that we left ASCII space. */
4112 if (new_max < 128)
4113 new_max = 128;
4114 break;
4115 case 3:
4116 /* Between 0x0FFF and 0xFFFF inclusive, so values are
4117 always > 255 and <= 65535 and will always need 2 bytes. */
4118 if (max_char < 65535)
4119 new_max = 65535;
4120 break;
4121 case 4:
4122 /* Code point will be above 0xFFFF for sure in this case. */
4123 new_max = 65537;
4124 break;
4125 /* Internal error, this should be caught by the first if */
4126 case 1:
4127 default:
4128 assert(0 && "Impossible case in utf8_max_char_and_size");
4129 err = 1;
4130 }
4131 /* Instead of number of overall bytes for this code point,
Georg Brandl7597add2011-10-05 16:36:47 +02004132 n contains the number of following bytes: */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004133 --n;
4134 /* Check if the follow up chars are all valid continuation bytes */
4135 if (n >= 1) {
4136 const unsigned char *cont;
4137 if ((p + n) >= end) {
4138 if (consumed == 0)
4139 /* incomplete data, non-incremental decoding */
4140 err = 1;
4141 break;
4142 }
4143 for (cont = p + 1; cont < (p + n); ++cont) {
4144 if ((*cont & 0xc0) != 0x80) {
4145 err = 1;
4146 break;
4147 }
4148 }
4149 p += n;
4150 }
4151 else
4152 err = 1;
4153 max_char = new_max;
4154 }
4155 }
4156
4157 if (unicode_size)
4158 *unicode_size = char_count;
4159 if (has_errors)
4160 *has_errors = err;
4161 return max_char;
4162}
4163
/* Store `value` at position `index` of the character buffer `buf`, whose
   element type is selected by `kind`.  Similar to PyUnicode_WRITE, but
   also accepts PyUnicode_WCHAR_KIND for writing into the wstr field of
   the legacy unicode representation.  Any kind other than WCHAR, 1BYTE
   or 2BYTE is written as Py_UCS4.  `index` and `value` are each
   evaluated exactly once. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                     \
    do {                                                                    \
        const int k_ = (kind);                                              \
        switch (k_) {                                                       \
        case PyUnicode_WCHAR_KIND:                                          \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);           \
            break;                                                          \
        case PyUnicode_1BYTE_KIND:                                          \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value);     \
            break;                                                          \
        case PyUnicode_2BYTE_KIND:                                          \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);                 \
            break;                                                          \
        default:                                                            \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);                 \
            break;                                                          \
        }                                                                   \
    } while (0)
4178
Alexander Belopolsky40018472011-02-26 01:02:56 +00004179PyObject *
4180PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004181 Py_ssize_t size,
4182 const char *errors,
4183 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004184{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004185 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004186 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004187 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004188 Py_ssize_t startinpos;
4189 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00004190 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004191 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004192 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004193 PyObject *errorHandler = NULL;
4194 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004195 Py_UCS4 maxchar = 0;
4196 Py_ssize_t unicode_size;
4197 Py_ssize_t i;
4198 int kind;
4199 void *data;
4200 int has_errors;
4201 Py_UNICODE *error_outptr;
4202#if SIZEOF_WCHAR_T == 2
4203 Py_ssize_t wchar_offset = 0;
4204#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004205
Walter Dörwald69652032004-09-07 20:24:22 +00004206 if (size == 0) {
4207 if (consumed)
4208 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004209 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00004210 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004211 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
4212 consumed, &has_errors);
4213 if (has_errors) {
4214 unicode = _PyUnicode_New(size);
4215 if (!unicode)
4216 return NULL;
4217 kind = PyUnicode_WCHAR_KIND;
4218 data = PyUnicode_AS_UNICODE(unicode);
4219 assert(data != NULL);
4220 }
4221 else {
4222 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
4223 if (!unicode)
4224 return NULL;
4225 /* When the string is ASCII only, just use memcpy and return.
4226 unicode_size may be != size if there is an incomplete UTF-8
4227 sequence at the end of the ASCII block. */
4228 if (maxchar < 128 && size == unicode_size) {
4229 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4230 return (PyObject *)unicode;
4231 }
4232 kind = PyUnicode_KIND(unicode);
4233 data = PyUnicode_DATA(unicode);
4234 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004235 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004236 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004237 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00004238 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004239
4240 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004241 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004242
4243 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004244 /* Fast path for runs of ASCII characters. Given that common UTF-8
4245 input will consist of an overwhelming majority of ASCII
4246 characters, we try to optimize for this case by checking
4247 as many characters as a C 'long' can contain.
4248 First, check if we can do an aligned read, as most CPUs have
4249 a penalty for unaligned reads.
4250 */
4251 if (!((size_t) s & LONG_PTR_MASK)) {
4252 /* Help register allocation */
4253 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004254 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004255 while (_s < aligned_end) {
4256 /* Read a whole long at a time (either 4 or 8 bytes),
4257 and do a fast unrolled copy if it only contains ASCII
4258 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004259 unsigned long value = *(unsigned long *) _s;
4260 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004261 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004262 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4263 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4264 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4265 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004266#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004267 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4268 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4269 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4270 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004271#endif
4272 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004273 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004274 }
4275 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004276 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004277 if (s == e)
4278 break;
4279 ch = (unsigned char)*s;
4280 }
4281 }
4282
4283 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004284 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004285 s++;
4286 continue;
4287 }
4288
4289 n = utf8_code_length[ch];
4290
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004291 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004292 if (consumed)
4293 break;
4294 else {
4295 errmsg = "unexpected end of data";
4296 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004297 endinpos = startinpos+1;
4298 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4299 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004300 goto utf8Error;
4301 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004302 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004303
4304 switch (n) {
4305
4306 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004307 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004308 startinpos = s-starts;
4309 endinpos = startinpos+1;
4310 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004311
4312 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004313 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004314 startinpos = s-starts;
4315 endinpos = startinpos+1;
4316 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004317
4318 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004319 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004320 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004321 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004322 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004323 goto utf8Error;
4324 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004325 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004326 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004327 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004328 break;
4329
4330 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004331 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4332 will result in surrogates in range d800-dfff. Surrogates are
4333 not valid UTF-8 so they are rejected.
4334 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4335 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004336 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004337 (s[2] & 0xc0) != 0x80 ||
4338 ((unsigned char)s[0] == 0xE0 &&
4339 (unsigned char)s[1] < 0xA0) ||
4340 ((unsigned char)s[0] == 0xED &&
4341 (unsigned char)s[1] > 0x9F)) {
4342 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004343 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004344 endinpos = startinpos + 1;
4345
4346 /* if s[1] first two bits are 1 and 0, then the invalid
4347 continuation byte is s[2], so increment endinpos by 1,
4348 if not, s[1] is invalid and endinpos doesn't need to
4349 be incremented. */
4350 if ((s[1] & 0xC0) == 0x80)
4351 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004352 goto utf8Error;
4353 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004354 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004355 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004356 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004357 break;
4358
4359 case 4:
4360 if ((s[1] & 0xc0) != 0x80 ||
4361 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004362 (s[3] & 0xc0) != 0x80 ||
4363 ((unsigned char)s[0] == 0xF0 &&
4364 (unsigned char)s[1] < 0x90) ||
4365 ((unsigned char)s[0] == 0xF4 &&
4366 (unsigned char)s[1] > 0x8F)) {
4367 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004368 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004369 endinpos = startinpos + 1;
4370 if ((s[1] & 0xC0) == 0x80) {
4371 endinpos++;
4372 if ((s[2] & 0xC0) == 0x80)
4373 endinpos++;
4374 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004375 goto utf8Error;
4376 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004377 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004378 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4379 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4380
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004381 /* If the string is flexible or we have native UCS-4, write
4382 directly.. */
4383 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4384 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004385
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004386 else {
4387 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004388
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004389 /* translate from 10000..10FFFF to 0..FFFF */
4390 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004391
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004392 /* high surrogate = top 10 bits added to D800 */
4393 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4394 (Py_UNICODE)(0xD800 + (ch >> 10)));
4395
4396 /* low surrogate = bottom 10 bits added to DC00 */
4397 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4398 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4399 }
4400#if SIZEOF_WCHAR_T == 2
4401 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004402#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004403 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004404 }
4405 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004406 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004407
Benjamin Peterson29060642009-01-31 22:14:21 +00004408 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004409 /* If this is not yet a resizable string, make it one.. */
4410 if (kind != PyUnicode_WCHAR_KIND) {
4411 const Py_UNICODE *u;
4412 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4413 if (!new_unicode)
4414 goto onError;
4415 u = PyUnicode_AsUnicode((PyObject *)unicode);
4416 if (!u)
4417 goto onError;
4418#if SIZEOF_WCHAR_T == 2
4419 i += wchar_offset;
4420#endif
4421 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4422 Py_DECREF(unicode);
4423 unicode = new_unicode;
4424 kind = 0;
4425 data = PyUnicode_AS_UNICODE(new_unicode);
4426 assert(data != NULL);
4427 }
4428 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004429 if (unicode_decode_call_errorhandler(
4430 errors, &errorHandler,
4431 "utf8", errmsg,
4432 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004433 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004434 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004435 /* Update data because unicode_decode_call_errorhandler might have
4436 re-created or resized the unicode object. */
4437 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004438 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004439 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004440 /* Ensure the unicode_size calculation above was correct: */
4441 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4442
Walter Dörwald69652032004-09-07 20:24:22 +00004443 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004444 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004445
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004446 /* Adjust length and ready string when it contained errors and
4447 is of the old resizable kind. */
4448 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004449 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004450 goto onError;
4451 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004452
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004453 Py_XDECREF(errorHandler);
4454 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004455#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004456 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004457 Py_DECREF(unicode);
4458 return NULL;
4459 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004460#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004461 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004462 return (PyObject *)unicode;
4463
Benjamin Peterson29060642009-01-31 22:14:21 +00004464 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004465 Py_XDECREF(errorHandler);
4466 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004467 Py_DECREF(unicode);
4468 return NULL;
4469}
4470
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004471#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004472
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004473#ifdef __APPLE__
4474
4475/* Simplified UTF-8 decoder using surrogateescape error handler,
4476 used to decode the command line arguments on Mac OS X. */
4477
4478wchar_t*
4479_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4480{
4481 int n;
4482 const char *e;
4483 wchar_t *unicode, *p;
4484
4485 /* Note: size will always be longer than the resulting Unicode
4486 character count */
4487 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4488 PyErr_NoMemory();
4489 return NULL;
4490 }
4491 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4492 if (!unicode)
4493 return NULL;
4494
4495 /* Unpack UTF-8 encoded data */
4496 p = unicode;
4497 e = s + size;
4498 while (s < e) {
4499 Py_UCS4 ch = (unsigned char)*s;
4500
4501 if (ch < 0x80) {
4502 *p++ = (wchar_t)ch;
4503 s++;
4504 continue;
4505 }
4506
4507 n = utf8_code_length[ch];
4508 if (s + n > e) {
4509 goto surrogateescape;
4510 }
4511
4512 switch (n) {
4513 case 0:
4514 case 1:
4515 goto surrogateescape;
4516
4517 case 2:
4518 if ((s[1] & 0xc0) != 0x80)
4519 goto surrogateescape;
4520 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4521 assert ((ch > 0x007F) && (ch <= 0x07FF));
4522 *p++ = (wchar_t)ch;
4523 break;
4524
4525 case 3:
4526 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4527 will result in surrogates in range d800-dfff. Surrogates are
4528 not valid UTF-8 so they are rejected.
4529 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4530 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4531 if ((s[1] & 0xc0) != 0x80 ||
4532 (s[2] & 0xc0) != 0x80 ||
4533 ((unsigned char)s[0] == 0xE0 &&
4534 (unsigned char)s[1] < 0xA0) ||
4535 ((unsigned char)s[0] == 0xED &&
4536 (unsigned char)s[1] > 0x9F)) {
4537
4538 goto surrogateescape;
4539 }
4540 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4541 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004542 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004543 break;
4544
4545 case 4:
4546 if ((s[1] & 0xc0) != 0x80 ||
4547 (s[2] & 0xc0) != 0x80 ||
4548 (s[3] & 0xc0) != 0x80 ||
4549 ((unsigned char)s[0] == 0xF0 &&
4550 (unsigned char)s[1] < 0x90) ||
4551 ((unsigned char)s[0] == 0xF4 &&
4552 (unsigned char)s[1] > 0x8F)) {
4553 goto surrogateescape;
4554 }
4555 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4556 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4557 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4558
4559#if SIZEOF_WCHAR_T == 4
4560 *p++ = (wchar_t)ch;
4561#else
4562 /* compute and append the two surrogates: */
4563
4564 /* translate from 10000..10FFFF to 0..FFFF */
4565 ch -= 0x10000;
4566
4567 /* high surrogate = top 10 bits added to D800 */
4568 *p++ = (wchar_t)(0xD800 + (ch >> 10));
4569
4570 /* low surrogate = bottom 10 bits added to DC00 */
4571 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4572#endif
4573 break;
4574 }
4575 s += n;
4576 continue;
4577
4578 surrogateescape:
4579 *p++ = 0xDC00 + ch;
4580 s++;
4581 }
4582 *p = L'\0';
4583 return unicode;
4584}
4585
4586#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004587
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004588/* Primary internal function which creates utf8 encoded bytes objects.
4589
4590 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004591 and allocate exactly as much space needed at the end. Else allocate the
4592 maximum possible needed (4 result bytes per Unicode character), and return
4593 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004594*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004595PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004596_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004597{
Tim Peters602f7402002-04-27 18:03:26 +00004598#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004599
Guido van Rossum98297ee2007-11-06 21:34:58 +00004600 Py_ssize_t i; /* index into s of next input byte */
4601 PyObject *result; /* result string object */
4602 char *p; /* next free byte in output buffer */
4603 Py_ssize_t nallocated; /* number of result bytes allocated */
4604 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004605 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004606 PyObject *errorHandler = NULL;
4607 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004608 int kind;
4609 void *data;
4610 Py_ssize_t size;
4611 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4612#if SIZEOF_WCHAR_T == 2
4613 Py_ssize_t wchar_offset = 0;
4614#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004615
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004616 if (!PyUnicode_Check(unicode)) {
4617 PyErr_BadArgument();
4618 return NULL;
4619 }
4620
4621 if (PyUnicode_READY(unicode) == -1)
4622 return NULL;
4623
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004624 if (PyUnicode_UTF8(unicode))
4625 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4626 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004627
4628 kind = PyUnicode_KIND(unicode);
4629 data = PyUnicode_DATA(unicode);
4630 size = PyUnicode_GET_LENGTH(unicode);
4631
Tim Peters602f7402002-04-27 18:03:26 +00004632 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004633
Tim Peters602f7402002-04-27 18:03:26 +00004634 if (size <= MAX_SHORT_UNICHARS) {
4635 /* Write into the stack buffer; nallocated can't overflow.
4636 * At the end, we'll allocate exactly as much heap space as it
4637 * turns out we need.
4638 */
4639 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004640 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004641 p = stackbuf;
4642 }
4643 else {
4644 /* Overallocate on the heap, and give the excess back at the end. */
4645 nallocated = size * 4;
4646 if (nallocated / 4 != size) /* overflow! */
4647 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004648 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004649 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004650 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004651 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004652 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004653
Tim Peters602f7402002-04-27 18:03:26 +00004654 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004655 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004656
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004657 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004658 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004659 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004660
Guido van Rossumd57fd912000-03-10 22:53:23 +00004661 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004662 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004663 *p++ = (char)(0xc0 | (ch >> 6));
4664 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004665 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004666 Py_ssize_t newpos;
4667 PyObject *rep;
4668 Py_ssize_t repsize, k, startpos;
4669 startpos = i-1;
4670#if SIZEOF_WCHAR_T == 2
4671 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004672#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004673 rep = unicode_encode_call_errorhandler(
4674 errors, &errorHandler, "utf-8", "surrogates not allowed",
4675 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4676 &exc, startpos, startpos+1, &newpos);
4677 if (!rep)
4678 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004679
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004680 if (PyBytes_Check(rep))
4681 repsize = PyBytes_GET_SIZE(rep);
4682 else
4683 repsize = PyUnicode_GET_SIZE(rep);
4684
4685 if (repsize > 4) {
4686 Py_ssize_t offset;
4687
4688 if (result == NULL)
4689 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004690 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004691 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004692
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004693 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4694 /* integer overflow */
4695 PyErr_NoMemory();
4696 goto error;
4697 }
4698 nallocated += repsize - 4;
4699 if (result != NULL) {
4700 if (_PyBytes_Resize(&result, nallocated) < 0)
4701 goto error;
4702 } else {
4703 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004704 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004705 goto error;
4706 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4707 }
4708 p = PyBytes_AS_STRING(result) + offset;
4709 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004710
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004711 if (PyBytes_Check(rep)) {
4712 char *prep = PyBytes_AS_STRING(rep);
4713 for(k = repsize; k > 0; k--)
4714 *p++ = *prep++;
4715 } else /* rep is unicode */ {
4716 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4717 Py_UNICODE c;
4718
4719 for(k=0; k<repsize; k++) {
4720 c = prep[k];
4721 if (0x80 <= c) {
4722 raise_encode_exception(&exc, "utf-8",
4723 PyUnicode_AS_UNICODE(unicode),
4724 size, i-1, i,
4725 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004726 goto error;
4727 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004728 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004729 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004730 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004731 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004732 } else if (ch < 0x10000) {
4733 *p++ = (char)(0xe0 | (ch >> 12));
4734 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4735 *p++ = (char)(0x80 | (ch & 0x3f));
4736 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004737 /* Encode UCS4 Unicode ordinals */
4738 *p++ = (char)(0xf0 | (ch >> 18));
4739 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4740 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4741 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004742#if SIZEOF_WCHAR_T == 2
4743 wchar_offset++;
4744#endif
Tim Peters602f7402002-04-27 18:03:26 +00004745 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004746 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004747
Guido van Rossum98297ee2007-11-06 21:34:58 +00004748 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004749 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004750 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004751 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004752 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004753 }
4754 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004755 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004756 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004757 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004758 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004759 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004760
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004761 Py_XDECREF(errorHandler);
4762 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004763 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004764 error:
4765 Py_XDECREF(errorHandler);
4766 Py_XDECREF(exc);
4767 Py_XDECREF(result);
4768 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004769
Tim Peters602f7402002-04-27 18:03:26 +00004770#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004771}
4772
Alexander Belopolsky40018472011-02-26 01:02:56 +00004773PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004774PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4775 Py_ssize_t size,
4776 const char *errors)
4777{
4778 PyObject *v, *unicode;
4779
4780 unicode = PyUnicode_FromUnicode(s, size);
4781 if (unicode == NULL)
4782 return NULL;
4783 v = _PyUnicode_AsUTF8String(unicode, errors);
4784 Py_DECREF(unicode);
4785 return v;
4786}
4787
4788PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004789PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004790{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004791 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004792}
4793
Walter Dörwald41980ca2007-08-16 21:55:45 +00004794/* --- UTF-32 Codec ------------------------------------------------------- */
4795
4796PyObject *
4797PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004798 Py_ssize_t size,
4799 const char *errors,
4800 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004801{
4802 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4803}
4804
4805PyObject *
4806PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004807 Py_ssize_t size,
4808 const char *errors,
4809 int *byteorder,
4810 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004811{
4812 const char *starts = s;
4813 Py_ssize_t startinpos;
4814 Py_ssize_t endinpos;
4815 Py_ssize_t outpos;
4816 PyUnicodeObject *unicode;
4817 Py_UNICODE *p;
4818#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004819 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004820 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004821#else
4822 const int pairs = 0;
4823#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004824 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004825 int bo = 0; /* assume native ordering by default */
4826 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004827 /* Offsets from q for retrieving bytes in the right order. */
4828#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4829 int iorder[] = {0, 1, 2, 3};
4830#else
4831 int iorder[] = {3, 2, 1, 0};
4832#endif
4833 PyObject *errorHandler = NULL;
4834 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004835
Walter Dörwald41980ca2007-08-16 21:55:45 +00004836 q = (unsigned char *)s;
4837 e = q + size;
4838
4839 if (byteorder)
4840 bo = *byteorder;
4841
4842 /* Check for BOM marks (U+FEFF) in the input and adjust current
4843 byte order setting accordingly. In native mode, the leading BOM
4844 mark is skipped, in all other modes, it is copied to the output
4845 stream as-is (giving a ZWNBSP character). */
4846 if (bo == 0) {
4847 if (size >= 4) {
4848 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004849 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004850#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004851 if (bom == 0x0000FEFF) {
4852 q += 4;
4853 bo = -1;
4854 }
4855 else if (bom == 0xFFFE0000) {
4856 q += 4;
4857 bo = 1;
4858 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004859#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004860 if (bom == 0x0000FEFF) {
4861 q += 4;
4862 bo = 1;
4863 }
4864 else if (bom == 0xFFFE0000) {
4865 q += 4;
4866 bo = -1;
4867 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004868#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004869 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004870 }
4871
4872 if (bo == -1) {
4873 /* force LE */
4874 iorder[0] = 0;
4875 iorder[1] = 1;
4876 iorder[2] = 2;
4877 iorder[3] = 3;
4878 }
4879 else if (bo == 1) {
4880 /* force BE */
4881 iorder[0] = 3;
4882 iorder[1] = 2;
4883 iorder[2] = 1;
4884 iorder[3] = 0;
4885 }
4886
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004887 /* On narrow builds we split characters outside the BMP into two
4888 codepoints => count how much extra space we need. */
4889#ifndef Py_UNICODE_WIDE
4890 for (qq = q; qq < e; qq += 4)
4891 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4892 pairs++;
4893#endif
4894
4895 /* This might be one to much, because of a BOM */
4896 unicode = _PyUnicode_New((size+3)/4+pairs);
4897 if (!unicode)
4898 return NULL;
4899 if (size == 0)
4900 return (PyObject *)unicode;
4901
4902 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004903 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004904
Walter Dörwald41980ca2007-08-16 21:55:45 +00004905 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004906 Py_UCS4 ch;
4907 /* remaining bytes at the end? (size should be divisible by 4) */
4908 if (e-q<4) {
4909 if (consumed)
4910 break;
4911 errmsg = "truncated data";
4912 startinpos = ((const char *)q)-starts;
4913 endinpos = ((const char *)e)-starts;
4914 goto utf32Error;
4915 /* The remaining input chars are ignored if the callback
4916 chooses to skip the input */
4917 }
4918 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4919 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004920
Benjamin Peterson29060642009-01-31 22:14:21 +00004921 if (ch >= 0x110000)
4922 {
4923 errmsg = "codepoint not in range(0x110000)";
4924 startinpos = ((const char *)q)-starts;
4925 endinpos = startinpos+4;
4926 goto utf32Error;
4927 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004928#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004929 if (ch >= 0x10000)
4930 {
4931 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4932 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4933 }
4934 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004935#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004936 *p++ = ch;
4937 q += 4;
4938 continue;
4939 utf32Error:
4940 outpos = p-PyUnicode_AS_UNICODE(unicode);
4941 if (unicode_decode_call_errorhandler(
4942 errors, &errorHandler,
4943 "utf32", errmsg,
4944 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4945 &unicode, &outpos, &p))
4946 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004947 }
4948
4949 if (byteorder)
4950 *byteorder = bo;
4951
4952 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004953 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004954
4955 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02004956 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004957 goto onError;
4958
4959 Py_XDECREF(errorHandler);
4960 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004961#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004962 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004963 Py_DECREF(unicode);
4964 return NULL;
4965 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004966#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004967 assert(_PyUnicode_CheckConsistency(unicode, 1));
Walter Dörwald41980ca2007-08-16 21:55:45 +00004968 return (PyObject *)unicode;
4969
Benjamin Peterson29060642009-01-31 22:14:21 +00004970 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004971 Py_DECREF(unicode);
4972 Py_XDECREF(errorHandler);
4973 Py_XDECREF(exc);
4974 return NULL;
4975}
4976
4977PyObject *
4978PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004979 Py_ssize_t size,
4980 const char *errors,
4981 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004982{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004983 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004984 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004985 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004986#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004987 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004988#else
4989 const int pairs = 0;
4990#endif
4991 /* Offsets from p for storing byte pairs in the right order. */
4992#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4993 int iorder[] = {0, 1, 2, 3};
4994#else
4995 int iorder[] = {3, 2, 1, 0};
4996#endif
4997
Benjamin Peterson29060642009-01-31 22:14:21 +00004998#define STORECHAR(CH) \
4999 do { \
5000 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5001 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5002 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5003 p[iorder[0]] = (CH) & 0xff; \
5004 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005005 } while(0)
5006
5007 /* In narrow builds we can output surrogate pairs as one codepoint,
5008 so we need less space. */
5009#ifndef Py_UNICODE_WIDE
5010 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005011 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
5012 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
5013 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005014#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005015 nsize = (size - pairs + (byteorder == 0));
5016 bytesize = nsize * 4;
5017 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005018 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005019 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005020 if (v == NULL)
5021 return NULL;
5022
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005023 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005024 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005025 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005026 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005027 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005028
5029 if (byteorder == -1) {
5030 /* force LE */
5031 iorder[0] = 0;
5032 iorder[1] = 1;
5033 iorder[2] = 2;
5034 iorder[3] = 3;
5035 }
5036 else if (byteorder == 1) {
5037 /* force BE */
5038 iorder[0] = 3;
5039 iorder[1] = 2;
5040 iorder[2] = 1;
5041 iorder[3] = 0;
5042 }
5043
5044 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005045 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005046#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005047 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
5048 Py_UCS4 ch2 = *s;
5049 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
5050 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
5051 s++;
5052 size--;
5053 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005054 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005055#endif
5056 STORECHAR(ch);
5057 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005058
5059 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005060 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005061#undef STORECHAR
5062}
5063
Alexander Belopolsky40018472011-02-26 01:02:56 +00005064PyObject *
5065PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005066{
5067 if (!PyUnicode_Check(unicode)) {
5068 PyErr_BadArgument();
5069 return NULL;
5070 }
5071 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005072 PyUnicode_GET_SIZE(unicode),
5073 NULL,
5074 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005075}
5076
Guido van Rossumd57fd912000-03-10 22:53:23 +00005077/* --- UTF-16 Codec ------------------------------------------------------- */
5078
Tim Peters772747b2001-08-09 22:21:55 +00005079PyObject *
5080PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005081 Py_ssize_t size,
5082 const char *errors,
5083 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005084{
Walter Dörwald69652032004-09-07 20:24:22 +00005085 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5086}
5087
Antoine Pitrouab868312009-01-10 15:40:25 +00005088/* Two masks for fast checking of whether a C 'long' may contain
5089 UTF16-encoded surrogate characters. This is an efficient heuristic,
5090 assuming that non-surrogate characters with a code point >= 0x8000 are
5091 rare in most input.
5092 FAST_CHAR_MASK is used when the input is in native byte ordering,
5093 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005094*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005095#if (SIZEOF_LONG == 8)
5096# define FAST_CHAR_MASK 0x8000800080008000L
5097# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5098#elif (SIZEOF_LONG == 4)
5099# define FAST_CHAR_MASK 0x80008000L
5100# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5101#else
5102# error C 'long' size should be either 4 or 8!
5103#endif
5104
Walter Dörwald69652032004-09-07 20:24:22 +00005105PyObject *
5106PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005107 Py_ssize_t size,
5108 const char *errors,
5109 int *byteorder,
5110 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005111{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005112 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005113 Py_ssize_t startinpos;
5114 Py_ssize_t endinpos;
5115 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005116 PyUnicodeObject *unicode;
5117 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00005118 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005119 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005120 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005121 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005122 /* Offsets from q for retrieving byte pairs in the right order. */
5123#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5124 int ihi = 1, ilo = 0;
5125#else
5126 int ihi = 0, ilo = 1;
5127#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005128 PyObject *errorHandler = NULL;
5129 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005130
5131 /* Note: size will always be longer than the resulting Unicode
5132 character count */
5133 unicode = _PyUnicode_New(size);
5134 if (!unicode)
5135 return NULL;
5136 if (size == 0)
5137 return (PyObject *)unicode;
5138
5139 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005140 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00005141 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005142 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005143
5144 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005145 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005146
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005147 /* Check for BOM marks (U+FEFF) in the input and adjust current
5148 byte order setting accordingly. In native mode, the leading BOM
5149 mark is skipped, in all other modes, it is copied to the output
5150 stream as-is (giving a ZWNBSP character). */
5151 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005152 if (size >= 2) {
5153 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005154#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005155 if (bom == 0xFEFF) {
5156 q += 2;
5157 bo = -1;
5158 }
5159 else if (bom == 0xFFFE) {
5160 q += 2;
5161 bo = 1;
5162 }
Tim Petersced69f82003-09-16 20:30:58 +00005163#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005164 if (bom == 0xFEFF) {
5165 q += 2;
5166 bo = 1;
5167 }
5168 else if (bom == 0xFFFE) {
5169 q += 2;
5170 bo = -1;
5171 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005172#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005173 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005174 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005175
Tim Peters772747b2001-08-09 22:21:55 +00005176 if (bo == -1) {
5177 /* force LE */
5178 ihi = 1;
5179 ilo = 0;
5180 }
5181 else if (bo == 1) {
5182 /* force BE */
5183 ihi = 0;
5184 ilo = 1;
5185 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005186#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5187 native_ordering = ilo < ihi;
5188#else
5189 native_ordering = ilo > ihi;
5190#endif
Tim Peters772747b2001-08-09 22:21:55 +00005191
Antoine Pitrouab868312009-01-10 15:40:25 +00005192 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005193 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005194 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005195 /* First check for possible aligned read of a C 'long'. Unaligned
5196 reads are more expensive, better to defer to another iteration. */
5197 if (!((size_t) q & LONG_PTR_MASK)) {
5198 /* Fast path for runs of non-surrogate chars. */
5199 register const unsigned char *_q = q;
5200 Py_UNICODE *_p = p;
5201 if (native_ordering) {
5202 /* Native ordering is simple: as long as the input cannot
5203 possibly contain a surrogate char, do an unrolled copy
5204 of several 16-bit code points to the target object.
5205 The non-surrogate check is done on several input bytes
5206 at a time (as many as a C 'long' can contain). */
5207 while (_q < aligned_end) {
5208 unsigned long data = * (unsigned long *) _q;
5209 if (data & FAST_CHAR_MASK)
5210 break;
5211 _p[0] = ((unsigned short *) _q)[0];
5212 _p[1] = ((unsigned short *) _q)[1];
5213#if (SIZEOF_LONG == 8)
5214 _p[2] = ((unsigned short *) _q)[2];
5215 _p[3] = ((unsigned short *) _q)[3];
5216#endif
5217 _q += SIZEOF_LONG;
5218 _p += SIZEOF_LONG / 2;
5219 }
5220 }
5221 else {
5222 /* Byteswapped ordering is similar, but we must decompose
5223 the copy bytewise, and take care of zero'ing out the
5224 upper bytes if the target object is in 32-bit units
5225 (that is, in UCS-4 builds). */
5226 while (_q < aligned_end) {
5227 unsigned long data = * (unsigned long *) _q;
5228 if (data & SWAPPED_FAST_CHAR_MASK)
5229 break;
5230 /* Zero upper bytes in UCS-4 builds */
5231#if (Py_UNICODE_SIZE > 2)
5232 _p[0] = 0;
5233 _p[1] = 0;
5234#if (SIZEOF_LONG == 8)
5235 _p[2] = 0;
5236 _p[3] = 0;
5237#endif
5238#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005239 /* Issue #4916; UCS-4 builds on big endian machines must
5240 fill the two last bytes of each 4-byte unit. */
5241#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
5242# define OFF 2
5243#else
5244# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00005245#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005246 ((unsigned char *) _p)[OFF + 1] = _q[0];
5247 ((unsigned char *) _p)[OFF + 0] = _q[1];
5248 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
5249 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
5250#if (SIZEOF_LONG == 8)
5251 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
5252 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
5253 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
5254 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
5255#endif
5256#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00005257 _q += SIZEOF_LONG;
5258 _p += SIZEOF_LONG / 2;
5259 }
5260 }
5261 p = _p;
5262 q = _q;
5263 if (q >= e)
5264 break;
5265 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005266 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005267
Benjamin Peterson14339b62009-01-31 16:36:08 +00005268 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005269
5270 if (ch < 0xD800 || ch > 0xDFFF) {
5271 *p++ = ch;
5272 continue;
5273 }
5274
5275 /* UTF-16 code pair: */
5276 if (q > e) {
5277 errmsg = "unexpected end of data";
5278 startinpos = (((const char *)q) - 2) - starts;
5279 endinpos = ((const char *)e) + 1 - starts;
5280 goto utf16Error;
5281 }
5282 if (0xD800 <= ch && ch <= 0xDBFF) {
5283 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5284 q += 2;
5285 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00005286#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005287 *p++ = ch;
5288 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005289#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005290 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005291#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005292 continue;
5293 }
5294 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005295 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005296 startinpos = (((const char *)q)-4)-starts;
5297 endinpos = startinpos+2;
5298 goto utf16Error;
5299 }
5300
Benjamin Peterson14339b62009-01-31 16:36:08 +00005301 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005302 errmsg = "illegal encoding";
5303 startinpos = (((const char *)q)-2)-starts;
5304 endinpos = startinpos+2;
5305 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005306
Benjamin Peterson29060642009-01-31 22:14:21 +00005307 utf16Error:
5308 outpos = p - PyUnicode_AS_UNICODE(unicode);
5309 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005310 errors,
5311 &errorHandler,
5312 "utf16", errmsg,
5313 &starts,
5314 (const char **)&e,
5315 &startinpos,
5316 &endinpos,
5317 &exc,
5318 (const char **)&q,
5319 &unicode,
5320 &outpos,
5321 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00005322 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005323 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005324 /* remaining byte at the end? (size should be even) */
5325 if (e == q) {
5326 if (!consumed) {
5327 errmsg = "truncated data";
5328 startinpos = ((const char *)q) - starts;
5329 endinpos = ((const char *)e) + 1 - starts;
5330 outpos = p - PyUnicode_AS_UNICODE(unicode);
5331 if (unicode_decode_call_errorhandler(
5332 errors,
5333 &errorHandler,
5334 "utf16", errmsg,
5335 &starts,
5336 (const char **)&e,
5337 &startinpos,
5338 &endinpos,
5339 &exc,
5340 (const char **)&q,
5341 &unicode,
5342 &outpos,
5343 &p))
5344 goto onError;
5345 /* The remaining input chars are ignored if the callback
5346 chooses to skip the input */
5347 }
5348 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005349
5350 if (byteorder)
5351 *byteorder = bo;
5352
Walter Dörwald69652032004-09-07 20:24:22 +00005353 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005354 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005355
Guido van Rossumd57fd912000-03-10 22:53:23 +00005356 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005357 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358 goto onError;
5359
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005360 Py_XDECREF(errorHandler);
5361 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005362#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005363 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005364 Py_DECREF(unicode);
5365 return NULL;
5366 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005367#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005368 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369 return (PyObject *)unicode;
5370
Benjamin Peterson29060642009-01-31 22:14:21 +00005371 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005372 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005373 Py_XDECREF(errorHandler);
5374 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005375 return NULL;
5376}
5377
Antoine Pitrouab868312009-01-10 15:40:25 +00005378#undef FAST_CHAR_MASK
5379#undef SWAPPED_FAST_CHAR_MASK
5380
Tim Peters772747b2001-08-09 22:21:55 +00005381PyObject *
5382PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005383 Py_ssize_t size,
5384 const char *errors,
5385 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005386{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005387 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005388 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005389 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005390#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005391 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005392#else
5393 const int pairs = 0;
5394#endif
Tim Peters772747b2001-08-09 22:21:55 +00005395 /* Offsets from p for storing byte pairs in the right order. */
5396#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5397 int ihi = 1, ilo = 0;
5398#else
5399 int ihi = 0, ilo = 1;
5400#endif
5401
Benjamin Peterson29060642009-01-31 22:14:21 +00005402#define STORECHAR(CH) \
5403 do { \
5404 p[ihi] = ((CH) >> 8) & 0xff; \
5405 p[ilo] = (CH) & 0xff; \
5406 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005407 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005408
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005409#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005410 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005411 if (s[i] >= 0x10000)
5412 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005413#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005414 /* 2 * (size + pairs + (byteorder == 0)) */
5415 if (size > PY_SSIZE_T_MAX ||
5416 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005417 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005418 nsize = size + pairs + (byteorder == 0);
5419 bytesize = nsize * 2;
5420 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005421 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005422 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423 if (v == NULL)
5424 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005425
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005426 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005427 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005428 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005429 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005430 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005431
5432 if (byteorder == -1) {
5433 /* force LE */
5434 ihi = 1;
5435 ilo = 0;
5436 }
5437 else if (byteorder == 1) {
5438 /* force BE */
5439 ihi = 0;
5440 ilo = 1;
5441 }
5442
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005443 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005444 Py_UNICODE ch = *s++;
5445 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005446#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005447 if (ch >= 0x10000) {
5448 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5449 ch = 0xD800 | ((ch-0x10000) >> 10);
5450 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005451#endif
Tim Peters772747b2001-08-09 22:21:55 +00005452 STORECHAR(ch);
5453 if (ch2)
5454 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005455 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005456
5457 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005458 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005459#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005460}
5461
Alexander Belopolsky40018472011-02-26 01:02:56 +00005462PyObject *
5463PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005464{
5465 if (!PyUnicode_Check(unicode)) {
5466 PyErr_BadArgument();
5467 return NULL;
5468 }
5469 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005470 PyUnicode_GET_SIZE(unicode),
5471 NULL,
5472 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005473}
5474
5475/* --- Unicode Escape Codec ----------------------------------------------- */
5476
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005477/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5478 if all the escapes in the string make it still a valid ASCII string.
5479 Returns -1 if any escapes were found which cause the string to
5480 pop out of ASCII range. Otherwise returns the length of the
5481 required buffer to hold the string.
5482 */
5483Py_ssize_t
5484length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5485{
5486 const unsigned char *p = (const unsigned char *)s;
5487 const unsigned char *end = p + size;
5488 Py_ssize_t length = 0;
5489
5490 if (size < 0)
5491 return -1;
5492
5493 for (; p < end; ++p) {
5494 if (*p > 127) {
5495 /* Non-ASCII */
5496 return -1;
5497 }
5498 else if (*p != '\\') {
5499 /* Normal character */
5500 ++length;
5501 }
5502 else {
5503 /* Backslash-escape, check next char */
5504 ++p;
5505 /* Escape sequence reaches till end of string or
5506 non-ASCII follow-up. */
5507 if (p >= end || *p > 127)
5508 return -1;
5509 switch (*p) {
5510 case '\n':
5511 /* backslash + \n result in zero characters */
5512 break;
5513 case '\\': case '\'': case '\"':
5514 case 'b': case 'f': case 't':
5515 case 'n': case 'r': case 'v': case 'a':
5516 ++length;
5517 break;
5518 case '0': case '1': case '2': case '3':
5519 case '4': case '5': case '6': case '7':
5520 case 'x': case 'u': case 'U': case 'N':
5521 /* these do not guarantee ASCII characters */
5522 return -1;
5523 default:
5524 /* count the backslash + the other character */
5525 length += 2;
5526 }
5527 }
5528 }
5529 return length;
5530}
5531
/* Similar to PyUnicode_WRITE but either write into wstr field
   or treat string as ASCII: when `kind` is PyUnicode_WCHAR_KIND the
   buffer is written as Py_UNICODE units, otherwise `value` is stored as
   a single byte.  `index` is evaluated exactly once. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value)                    \
    do {                                                                \
        if ((kind) != PyUnicode_WCHAR_KIND)                             \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else                                                            \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);       \
    } while (0)

/* Store `value` as a Py_UNICODE unit at `index` of `buf`.
   NOTE: the assertion reads a variable named `kind` from the scope where
   the macro is expanded; it is not a macro parameter.  Wrapped in
   do { } while (0) so the expansion is a single statement and cannot be
   split by operator precedence at the expansion site. */
#define WRITE_WSTR(buf, index, value)                                   \
    do {                                                                \
        assert(kind == PyUnicode_WCHAR_KIND);                           \
        ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);           \
    } while (0)
5545
5546
Fredrik Lundh06d12682001-01-24 07:59:11 +00005547static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005548
Alexander Belopolsky40018472011-02-26 01:02:56 +00005549PyObject *
5550PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005551 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005552 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005553{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005554 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005555 Py_ssize_t startinpos;
5556 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005557 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005558 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005559 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005560 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005561 char* message;
5562 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005563 PyObject *errorHandler = NULL;
5564 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005565 Py_ssize_t ascii_length;
5566 Py_ssize_t i;
5567 int kind;
5568 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005569
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005570 ascii_length = length_of_escaped_ascii_string(s, size);
5571
5572 /* After length_of_escaped_ascii_string() there are two alternatives,
5573 either the string is pure ASCII with named escapes like \n, etc.
5574 and we determined it's exact size (common case)
5575 or it contains \x, \u, ... escape sequences. then we create a
5576 legacy wchar string and resize it at the end of this function. */
5577 if (ascii_length >= 0) {
5578 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5579 if (!v)
5580 goto onError;
5581 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5582 kind = PyUnicode_1BYTE_KIND;
5583 data = PyUnicode_DATA(v);
5584 }
5585 else {
5586 /* Escaped strings will always be longer than the resulting
5587 Unicode string, so we start with size here and then reduce the
5588 length after conversion to the true value.
5589 (but if the error callback returns a long replacement string
5590 we'll have to allocate more space) */
5591 v = _PyUnicode_New(size);
5592 if (!v)
5593 goto onError;
5594 kind = PyUnicode_WCHAR_KIND;
5595 data = PyUnicode_AS_UNICODE(v);
5596 }
5597
Guido van Rossumd57fd912000-03-10 22:53:23 +00005598 if (size == 0)
5599 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005600 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005601 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005602
Guido van Rossumd57fd912000-03-10 22:53:23 +00005603 while (s < end) {
5604 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005605 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005606 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005607
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005608 if (kind == PyUnicode_WCHAR_KIND) {
5609 assert(i < _PyUnicode_WSTR_LENGTH(v));
5610 }
5611 else {
5612 /* The only case in which i == ascii_length is a backslash
5613 followed by a newline. */
5614 assert(i <= ascii_length);
5615 }
5616
Guido van Rossumd57fd912000-03-10 22:53:23 +00005617 /* Non-escape characters are interpreted as Unicode ordinals */
5618 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005619 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005620 continue;
5621 }
5622
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005623 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005624 /* \ - Escapes */
5625 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005626 c = *s++;
5627 if (s > end)
5628 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005629
5630 if (kind == PyUnicode_WCHAR_KIND) {
5631 assert(i < _PyUnicode_WSTR_LENGTH(v));
5632 }
5633 else {
5634 /* The only case in which i == ascii_length is a backslash
5635 followed by a newline. */
5636 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5637 }
5638
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005639 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005640
Benjamin Peterson29060642009-01-31 22:14:21 +00005641 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005642 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005643 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5644 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5645 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5646 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5647 /* FF */
5648 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5649 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5650 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5651 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5652 /* VT */
5653 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5654 /* BEL, not classic C */
5655 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005656
Benjamin Peterson29060642009-01-31 22:14:21 +00005657 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005658 case '0': case '1': case '2': case '3':
5659 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005660 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005661 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005662 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005663 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005664 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005665 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005666 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005667 break;
5668
Benjamin Peterson29060642009-01-31 22:14:21 +00005669 /* hex escapes */
5670 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005671 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005672 digits = 2;
5673 message = "truncated \\xXX escape";
5674 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005675
Benjamin Peterson29060642009-01-31 22:14:21 +00005676 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005678 digits = 4;
5679 message = "truncated \\uXXXX escape";
5680 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005681
Benjamin Peterson29060642009-01-31 22:14:21 +00005682 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005683 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005684 digits = 8;
5685 message = "truncated \\UXXXXXXXX escape";
5686 hexescape:
5687 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005688 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005689 if (s+digits>end) {
5690 endinpos = size;
5691 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005692 errors, &errorHandler,
5693 "unicodeescape", "end of string in escape sequence",
5694 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005695 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005696 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005697 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005698 goto nextByte;
5699 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005700 for (j = 0; j < digits; ++j) {
5701 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005702 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005703 endinpos = (s+j+1)-starts;
5704 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005705 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005706 errors, &errorHandler,
5707 "unicodeescape", message,
5708 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005709 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005710 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005711 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005712 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005713 }
5714 chr = (chr<<4) & ~0xF;
5715 if (c >= '0' && c <= '9')
5716 chr += c - '0';
5717 else if (c >= 'a' && c <= 'f')
5718 chr += 10 + c - 'a';
5719 else
5720 chr += 10 + c - 'A';
5721 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005722 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005723 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005724 /* _decoding_error will have already written into the
5725 target buffer. */
5726 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005727 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005728 /* when we get here, chr is a 32-bit unicode character */
5729 if (chr <= 0xffff)
5730 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005731 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005732 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005733 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005734 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005735#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005736 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005737#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005738 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005739 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5740 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005741#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005742 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005743 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005744 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005745 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005746 errors, &errorHandler,
5747 "unicodeescape", "illegal Unicode character",
5748 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005749 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005750 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005751 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005752 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005753 break;
5754
Benjamin Peterson29060642009-01-31 22:14:21 +00005755 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005756 case 'N':
5757 message = "malformed \\N character escape";
5758 if (ucnhash_CAPI == NULL) {
5759 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005760 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5761 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005762 if (ucnhash_CAPI == NULL)
5763 goto ucnhashError;
5764 }
5765 if (*s == '{') {
5766 const char *start = s+1;
5767 /* look for the closing brace */
5768 while (*s != '}' && s < end)
5769 s++;
5770 if (s > start && s < end && *s == '}') {
5771 /* found a name. look it up in the unicode database */
5772 message = "unknown Unicode character name";
5773 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005774 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5775 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005776 goto store;
5777 }
5778 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005779 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005780 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005781 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005782 errors, &errorHandler,
5783 "unicodeescape", message,
5784 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005785 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005786 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005787 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005788 break;
5789
5790 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005791 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005792 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005793 message = "\\ at end of string";
5794 s--;
5795 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005796 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005797 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005798 errors, &errorHandler,
5799 "unicodeescape", message,
5800 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005801 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005802 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005803 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005804 }
5805 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005806 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5807 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005808 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005809 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005810 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005811 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005812 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005813 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005814 /* Ensure the length prediction worked in case of ASCII strings */
5815 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5816
Victor Stinnerfe226c02011-10-03 03:52:20 +02005817 if (kind == PyUnicode_WCHAR_KIND)
5818 {
5819 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5820 goto onError;
Victor Stinnerfe226c02011-10-03 03:52:20 +02005821 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005822 Py_XDECREF(errorHandler);
5823 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005824#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005825 if (_PyUnicode_READY_REPLACE(&v)) {
5826 Py_DECREF(v);
5827 return NULL;
5828 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005829#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005830 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005831 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005832
Benjamin Peterson29060642009-01-31 22:14:21 +00005833 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005834 PyErr_SetString(
5835 PyExc_UnicodeError,
5836 "\\N escapes not supported (can't load unicodedata module)"
5837 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005838 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005839 Py_XDECREF(errorHandler);
5840 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005841 return NULL;
5842
Benjamin Peterson29060642009-01-31 22:14:21 +00005843 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005845 Py_XDECREF(errorHandler);
5846 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005847 return NULL;
5848}
5849
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005850#undef WRITE_ASCII_OR_WSTR
5851#undef WRITE_WSTR
5852
Guido van Rossumd57fd912000-03-10 22:53:23 +00005853/* Return a Unicode-Escape string version of the Unicode object.
5854
5855 If quotes is true, the string is enclosed in u"" or u'' quotes as
5856 appropriate.
5857
5858*/
5859
Walter Dörwald79e913e2007-05-12 11:08:06 +00005860static const char *hexdigits = "0123456789abcdef";
5861
Alexander Belopolsky40018472011-02-26 01:02:56 +00005862PyObject *
5863PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005864 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005866 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005867 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005869#ifdef Py_UNICODE_WIDE
5870 const Py_ssize_t expandsize = 10;
5871#else
5872 const Py_ssize_t expandsize = 6;
5873#endif
5874
Thomas Wouters89f507f2006-12-13 04:49:30 +00005875 /* XXX(nnorwitz): rather than over-allocating, it would be
5876 better to choose a different scheme. Perhaps scan the
5877 first N-chars of the string and allocate based on that size.
5878 */
5879 /* Initial allocation is based on the longest-possible unichr
5880 escape.
5881
5882 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5883 unichr, so in this case it's the longest unichr escape. In
5884 narrow (UTF-16) builds this is five chars per source unichr
5885 since there are two unichrs in the surrogate pair, so in narrow
5886 (UTF-16) builds it's not the longest unichr escape.
5887
5888 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5889 so in the narrow (UTF-16) build case it's the longest unichr
5890 escape.
5891 */
5892
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005893 if (size == 0)
5894 return PyBytes_FromStringAndSize(NULL, 0);
5895
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005896 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005897 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005898
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005899 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005900 2
5901 + expandsize*size
5902 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903 if (repr == NULL)
5904 return NULL;
5905
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005906 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005907
Guido van Rossumd57fd912000-03-10 22:53:23 +00005908 while (size-- > 0) {
5909 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005910
Walter Dörwald79e913e2007-05-12 11:08:06 +00005911 /* Escape backslashes */
5912 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005913 *p++ = '\\';
5914 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005915 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005916 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005917
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005918#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005919 /* Map 21-bit characters to '\U00xxxxxx' */
5920 else if (ch >= 0x10000) {
5921 *p++ = '\\';
5922 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005923 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5924 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5925 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5926 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5927 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5928 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5929 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5930 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005931 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005932 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005933#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005934 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5935 else if (ch >= 0xD800 && ch < 0xDC00) {
5936 Py_UNICODE ch2;
5937 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005938
Benjamin Peterson29060642009-01-31 22:14:21 +00005939 ch2 = *s++;
5940 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005941 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005942 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5943 *p++ = '\\';
5944 *p++ = 'U';
5945 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5946 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5947 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5948 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5949 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5950 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5951 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5952 *p++ = hexdigits[ucs & 0x0000000F];
5953 continue;
5954 }
5955 /* Fall through: isolated surrogates are copied as-is */
5956 s--;
5957 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005958 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005959#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005960
Guido van Rossumd57fd912000-03-10 22:53:23 +00005961 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005962 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005963 *p++ = '\\';
5964 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005965 *p++ = hexdigits[(ch >> 12) & 0x000F];
5966 *p++ = hexdigits[(ch >> 8) & 0x000F];
5967 *p++ = hexdigits[(ch >> 4) & 0x000F];
5968 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005970
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005971 /* Map special whitespace to '\t', \n', '\r' */
5972 else if (ch == '\t') {
5973 *p++ = '\\';
5974 *p++ = 't';
5975 }
5976 else if (ch == '\n') {
5977 *p++ = '\\';
5978 *p++ = 'n';
5979 }
5980 else if (ch == '\r') {
5981 *p++ = '\\';
5982 *p++ = 'r';
5983 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005984
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005985 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005986 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005988 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005989 *p++ = hexdigits[(ch >> 4) & 0x000F];
5990 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005991 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005992
Guido van Rossumd57fd912000-03-10 22:53:23 +00005993 /* Copy everything else as-is */
5994 else
5995 *p++ = (char) ch;
5996 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005998 assert(p - PyBytes_AS_STRING(repr) > 0);
5999 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6000 return NULL;
6001 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006002}
6003
Alexander Belopolsky40018472011-02-26 01:02:56 +00006004PyObject *
6005PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006006{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006007 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008 if (!PyUnicode_Check(unicode)) {
6009 PyErr_BadArgument();
6010 return NULL;
6011 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00006012 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6013 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006014 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015}
6016
6017/* --- Raw Unicode Escape Codec ------------------------------------------- */
6018
Alexander Belopolsky40018472011-02-26 01:02:56 +00006019PyObject *
6020PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006021 Py_ssize_t size,
6022 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006024 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006025 Py_ssize_t startinpos;
6026 Py_ssize_t endinpos;
6027 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006029 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030 const char *end;
6031 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006032 PyObject *errorHandler = NULL;
6033 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006034
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035 /* Escaped strings will always be longer than the resulting
6036 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006037 length after conversion to the true value. (But decoding error
6038 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006039 v = _PyUnicode_New(size);
6040 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006041 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006042 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006043 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006044 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006045 end = s + size;
6046 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006047 unsigned char c;
6048 Py_UCS4 x;
6049 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006050 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006051
Benjamin Peterson29060642009-01-31 22:14:21 +00006052 /* Non-escape characters are interpreted as Unicode ordinals */
6053 if (*s != '\\') {
6054 *p++ = (unsigned char)*s++;
6055 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006056 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006057 startinpos = s-starts;
6058
6059 /* \u-escapes are only interpreted iff the number of leading
6060 backslashes if odd */
6061 bs = s;
6062 for (;s < end;) {
6063 if (*s != '\\')
6064 break;
6065 *p++ = (unsigned char)*s++;
6066 }
6067 if (((s - bs) & 1) == 0 ||
6068 s >= end ||
6069 (*s != 'u' && *s != 'U')) {
6070 continue;
6071 }
6072 p--;
6073 count = *s=='u' ? 4 : 8;
6074 s++;
6075
6076 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
6077 outpos = p-PyUnicode_AS_UNICODE(v);
6078 for (x = 0, i = 0; i < count; ++i, ++s) {
6079 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006080 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006081 endinpos = s-starts;
6082 if (unicode_decode_call_errorhandler(
6083 errors, &errorHandler,
6084 "rawunicodeescape", "truncated \\uXXXX",
6085 &starts, &end, &startinpos, &endinpos, &exc, &s,
6086 &v, &outpos, &p))
6087 goto onError;
6088 goto nextByte;
6089 }
6090 x = (x<<4) & ~0xF;
6091 if (c >= '0' && c <= '9')
6092 x += c - '0';
6093 else if (c >= 'a' && c <= 'f')
6094 x += 10 + c - 'a';
6095 else
6096 x += 10 + c - 'A';
6097 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00006098 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00006099 /* UCS-2 character */
6100 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006101 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006102 /* UCS-4 character. Either store directly, or as
6103 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00006104#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006105 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006106#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006107 x -= 0x10000L;
6108 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
6109 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00006110#endif
6111 } else {
6112 endinpos = s-starts;
6113 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006114 if (unicode_decode_call_errorhandler(
6115 errors, &errorHandler,
6116 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006117 &starts, &end, &startinpos, &endinpos, &exc, &s,
6118 &v, &outpos, &p))
6119 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006120 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006121 nextByte:
6122 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02006124 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006125 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006126 Py_XDECREF(errorHandler);
6127 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006128#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006129 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006130 Py_DECREF(v);
6131 return NULL;
6132 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006133#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006134 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006135 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006136
Benjamin Peterson29060642009-01-31 22:14:21 +00006137 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006139 Py_XDECREF(errorHandler);
6140 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141 return NULL;
6142}
6143
Alexander Belopolsky40018472011-02-26 01:02:56 +00006144PyObject *
6145PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006146 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006147{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006148 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149 char *p;
6150 char *q;
6151
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006152#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006153 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006154#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006155 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006156#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00006157
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006158 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006159 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006160
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006161 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162 if (repr == NULL)
6163 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00006164 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006165 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006166
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006167 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006168 while (size-- > 0) {
6169 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006170#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006171 /* Map 32-bit characters to '\Uxxxxxxxx' */
6172 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006173 *p++ = '\\';
6174 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00006175 *p++ = hexdigits[(ch >> 28) & 0xf];
6176 *p++ = hexdigits[(ch >> 24) & 0xf];
6177 *p++ = hexdigits[(ch >> 20) & 0xf];
6178 *p++ = hexdigits[(ch >> 16) & 0xf];
6179 *p++ = hexdigits[(ch >> 12) & 0xf];
6180 *p++ = hexdigits[(ch >> 8) & 0xf];
6181 *p++ = hexdigits[(ch >> 4) & 0xf];
6182 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006183 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006184 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00006185#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006186 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6187 if (ch >= 0xD800 && ch < 0xDC00) {
6188 Py_UNICODE ch2;
6189 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006190
Benjamin Peterson29060642009-01-31 22:14:21 +00006191 ch2 = *s++;
6192 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00006193 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006194 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6195 *p++ = '\\';
6196 *p++ = 'U';
6197 *p++ = hexdigits[(ucs >> 28) & 0xf];
6198 *p++ = hexdigits[(ucs >> 24) & 0xf];
6199 *p++ = hexdigits[(ucs >> 20) & 0xf];
6200 *p++ = hexdigits[(ucs >> 16) & 0xf];
6201 *p++ = hexdigits[(ucs >> 12) & 0xf];
6202 *p++ = hexdigits[(ucs >> 8) & 0xf];
6203 *p++ = hexdigits[(ucs >> 4) & 0xf];
6204 *p++ = hexdigits[ucs & 0xf];
6205 continue;
6206 }
6207 /* Fall through: isolated surrogates are copied as-is */
6208 s--;
6209 size++;
6210 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006211#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006212 /* Map 16-bit characters to '\uxxxx' */
6213 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214 *p++ = '\\';
6215 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00006216 *p++ = hexdigits[(ch >> 12) & 0xf];
6217 *p++ = hexdigits[(ch >> 8) & 0xf];
6218 *p++ = hexdigits[(ch >> 4) & 0xf];
6219 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006220 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006221 /* Copy everything else as-is */
6222 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006223 *p++ = (char) ch;
6224 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006225 size = p - q;
6226
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006227 assert(size > 0);
6228 if (_PyBytes_Resize(&repr, size) < 0)
6229 return NULL;
6230 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006231}
6232
Alexander Belopolsky40018472011-02-26 01:02:56 +00006233PyObject *
6234PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006236 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006237 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00006238 PyErr_BadArgument();
6239 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006240 }
Walter Dörwald711005d2007-05-12 12:03:26 +00006241 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6242 PyUnicode_GET_SIZE(unicode));
6243
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006244 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006245}
6246
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006247/* --- Unicode Internal Codec ------------------------------------------- */
6248
Alexander Belopolsky40018472011-02-26 01:02:56 +00006249PyObject *
6250_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006251 Py_ssize_t size,
6252 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006253{
6254 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006255 Py_ssize_t startinpos;
6256 Py_ssize_t endinpos;
6257 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006258 PyUnicodeObject *v;
6259 Py_UNICODE *p;
6260 const char *end;
6261 const char *reason;
6262 PyObject *errorHandler = NULL;
6263 PyObject *exc = NULL;
6264
Neal Norwitzd43069c2006-01-08 01:12:10 +00006265#ifdef Py_UNICODE_WIDE
6266 Py_UNICODE unimax = PyUnicode_GetMax();
6267#endif
6268
Thomas Wouters89f507f2006-12-13 04:49:30 +00006269 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006270 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
6271 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006272 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006273 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
6274 as string was created with the old API. */
6275 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006276 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006277 p = PyUnicode_AS_UNICODE(v);
6278 end = s + size;
6279
6280 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006281 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006282 /* We have to sanity check the raw data, otherwise doom looms for
6283 some malformed UCS-4 data. */
6284 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006285#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006286 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006287#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006288 end-s < Py_UNICODE_SIZE
6289 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006290 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006291 startinpos = s - starts;
6292 if (end-s < Py_UNICODE_SIZE) {
6293 endinpos = end-starts;
6294 reason = "truncated input";
6295 }
6296 else {
6297 endinpos = s - starts + Py_UNICODE_SIZE;
6298 reason = "illegal code point (> 0x10FFFF)";
6299 }
6300 outpos = p - PyUnicode_AS_UNICODE(v);
6301 if (unicode_decode_call_errorhandler(
6302 errors, &errorHandler,
6303 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006304 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00006305 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006306 goto onError;
6307 }
6308 }
6309 else {
6310 p++;
6311 s += Py_UNICODE_SIZE;
6312 }
6313 }
6314
Victor Stinnerfe226c02011-10-03 03:52:20 +02006315 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006316 goto onError;
6317 Py_XDECREF(errorHandler);
6318 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006319#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006320 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006321 Py_DECREF(v);
6322 return NULL;
6323 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006324#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006325 assert(_PyUnicode_CheckConsistency(v, 1));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006326 return (PyObject *)v;
6327
Benjamin Peterson29060642009-01-31 22:14:21 +00006328 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006329 Py_XDECREF(v);
6330 Py_XDECREF(errorHandler);
6331 Py_XDECREF(exc);
6332 return NULL;
6333}
6334
Guido van Rossumd57fd912000-03-10 22:53:23 +00006335/* --- Latin-1 Codec ------------------------------------------------------ */
6336
Alexander Belopolsky40018472011-02-26 01:02:56 +00006337PyObject *
6338PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006339 Py_ssize_t size,
6340 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006341{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006342 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006343 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006344}
6345
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006346/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006347static void
6348make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006349 const char *encoding,
6350 const Py_UNICODE *unicode, Py_ssize_t size,
6351 Py_ssize_t startpos, Py_ssize_t endpos,
6352 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006353{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006354 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006355 *exceptionObject = PyUnicodeEncodeError_Create(
6356 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006357 }
6358 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006359 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6360 goto onError;
6361 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6362 goto onError;
6363 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6364 goto onError;
6365 return;
6366 onError:
6367 Py_DECREF(*exceptionObject);
6368 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006369 }
6370}
6371
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006372/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006373static void
6374raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006375 const char *encoding,
6376 const Py_UNICODE *unicode, Py_ssize_t size,
6377 Py_ssize_t startpos, Py_ssize_t endpos,
6378 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006379{
6380 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006381 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006382 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006383 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006384}
6385
6386/* error handling callback helper:
6387 build arguments, call the callback and check the arguments,
6388 put the result into newpos and return the replacement string, which
6389 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006390static PyObject *
6391unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006392 PyObject **errorHandler,
6393 const char *encoding, const char *reason,
6394 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6395 Py_ssize_t startpos, Py_ssize_t endpos,
6396 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006397{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006398 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006399
6400 PyObject *restuple;
6401 PyObject *resunicode;
6402
6403 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006404 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006405 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006406 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006407 }
6408
6409 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006410 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006411 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006412 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006413
6414 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006415 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006416 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006417 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006418 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006419 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006420 Py_DECREF(restuple);
6421 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006422 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006423 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006424 &resunicode, newpos)) {
6425 Py_DECREF(restuple);
6426 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006427 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006428 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6429 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6430 Py_DECREF(restuple);
6431 return NULL;
6432 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006433 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006434 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006435 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006436 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6437 Py_DECREF(restuple);
6438 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006439 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006440 Py_INCREF(resunicode);
6441 Py_DECREF(restuple);
6442 return resunicode;
6443}
6444
Alexander Belopolsky40018472011-02-26 01:02:56 +00006445static PyObject *
6446unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006447 Py_ssize_t size,
6448 const char *errors,
6449 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006450{
6451 /* output object */
6452 PyObject *res;
6453 /* pointers to the beginning and end+1 of input */
6454 const Py_UNICODE *startp = p;
6455 const Py_UNICODE *endp = p + size;
6456 /* pointer to the beginning of the unencodable characters */
6457 /* const Py_UNICODE *badp = NULL; */
6458 /* pointer into the output */
6459 char *str;
6460 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006461 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006462 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6463 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006464 PyObject *errorHandler = NULL;
6465 PyObject *exc = NULL;
6466 /* the following variable is used for caching string comparisons
6467 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6468 int known_errorHandler = -1;
6469
6470 /* allocate enough for a simple encoding without
6471 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006472 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006473 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006474 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006475 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006476 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006477 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006478 ressize = size;
6479
6480 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006481 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006482
Benjamin Peterson29060642009-01-31 22:14:21 +00006483 /* can we encode this? */
6484 if (c<limit) {
6485 /* no overflow check, because we know that the space is enough */
6486 *str++ = (char)c;
6487 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006488 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006489 else {
6490 Py_ssize_t unicodepos = p-startp;
6491 Py_ssize_t requiredsize;
6492 PyObject *repunicode;
6493 Py_ssize_t repsize;
6494 Py_ssize_t newpos;
6495 Py_ssize_t respos;
6496 Py_UNICODE *uni2;
6497 /* startpos for collecting unencodable chars */
6498 const Py_UNICODE *collstart = p;
6499 const Py_UNICODE *collend = p;
6500 /* find all unecodable characters */
6501 while ((collend < endp) && ((*collend)>=limit))
6502 ++collend;
6503 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6504 if (known_errorHandler==-1) {
6505 if ((errors==NULL) || (!strcmp(errors, "strict")))
6506 known_errorHandler = 1;
6507 else if (!strcmp(errors, "replace"))
6508 known_errorHandler = 2;
6509 else if (!strcmp(errors, "ignore"))
6510 known_errorHandler = 3;
6511 else if (!strcmp(errors, "xmlcharrefreplace"))
6512 known_errorHandler = 4;
6513 else
6514 known_errorHandler = 0;
6515 }
6516 switch (known_errorHandler) {
6517 case 1: /* strict */
6518 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6519 goto onError;
6520 case 2: /* replace */
6521 while (collstart++<collend)
6522 *str++ = '?'; /* fall through */
6523 case 3: /* ignore */
6524 p = collend;
6525 break;
6526 case 4: /* xmlcharrefreplace */
6527 respos = str - PyBytes_AS_STRING(res);
6528 /* determine replacement size (temporarily (mis)uses p) */
6529 for (p = collstart, repsize = 0; p < collend; ++p) {
6530 if (*p<10)
6531 repsize += 2+1+1;
6532 else if (*p<100)
6533 repsize += 2+2+1;
6534 else if (*p<1000)
6535 repsize += 2+3+1;
6536 else if (*p<10000)
6537 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006538#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006539 else
6540 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006541#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006542 else if (*p<100000)
6543 repsize += 2+5+1;
6544 else if (*p<1000000)
6545 repsize += 2+6+1;
6546 else
6547 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006548#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006549 }
6550 requiredsize = respos+repsize+(endp-collend);
6551 if (requiredsize > ressize) {
6552 if (requiredsize<2*ressize)
6553 requiredsize = 2*ressize;
6554 if (_PyBytes_Resize(&res, requiredsize))
6555 goto onError;
6556 str = PyBytes_AS_STRING(res) + respos;
6557 ressize = requiredsize;
6558 }
6559 /* generate replacement (temporarily (mis)uses p) */
6560 for (p = collstart; p < collend; ++p) {
6561 str += sprintf(str, "&#%d;", (int)*p);
6562 }
6563 p = collend;
6564 break;
6565 default:
6566 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6567 encoding, reason, startp, size, &exc,
6568 collstart-startp, collend-startp, &newpos);
6569 if (repunicode == NULL)
6570 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006571 if (PyBytes_Check(repunicode)) {
6572 /* Directly copy bytes result to output. */
6573 repsize = PyBytes_Size(repunicode);
6574 if (repsize > 1) {
6575 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006576 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006577 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6578 Py_DECREF(repunicode);
6579 goto onError;
6580 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006581 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006582 ressize += repsize-1;
6583 }
6584 memcpy(str, PyBytes_AsString(repunicode), repsize);
6585 str += repsize;
6586 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006587 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006588 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006589 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006590 /* need more space? (at least enough for what we
6591 have+the replacement+the rest of the string, so
6592 we won't have to check space for encodable characters) */
6593 respos = str - PyBytes_AS_STRING(res);
6594 repsize = PyUnicode_GET_SIZE(repunicode);
6595 requiredsize = respos+repsize+(endp-collend);
6596 if (requiredsize > ressize) {
6597 if (requiredsize<2*ressize)
6598 requiredsize = 2*ressize;
6599 if (_PyBytes_Resize(&res, requiredsize)) {
6600 Py_DECREF(repunicode);
6601 goto onError;
6602 }
6603 str = PyBytes_AS_STRING(res) + respos;
6604 ressize = requiredsize;
6605 }
6606 /* check if there is anything unencodable in the replacement
6607 and copy it to the output */
6608 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6609 c = *uni2;
6610 if (c >= limit) {
6611 raise_encode_exception(&exc, encoding, startp, size,
6612 unicodepos, unicodepos+1, reason);
6613 Py_DECREF(repunicode);
6614 goto onError;
6615 }
6616 *str = (char)c;
6617 }
6618 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006619 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006620 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006621 }
6622 }
    /* Resize if we allocated too much */
6624 size = str - PyBytes_AS_STRING(res);
6625 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006626 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006627 if (_PyBytes_Resize(&res, size) < 0)
6628 goto onError;
6629 }
6630
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006631 Py_XDECREF(errorHandler);
6632 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006633 return res;
6634
6635 onError:
6636 Py_XDECREF(res);
6637 Py_XDECREF(errorHandler);
6638 Py_XDECREF(exc);
6639 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006640}
6641
Alexander Belopolsky40018472011-02-26 01:02:56 +00006642PyObject *
6643PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006644 Py_ssize_t size,
6645 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006647 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648}
6649
Alexander Belopolsky40018472011-02-26 01:02:56 +00006650PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006651_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652{
6653 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006654 PyErr_BadArgument();
6655 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006656 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006657 if (PyUnicode_READY(unicode) == -1)
6658 return NULL;
6659 /* Fast path: if it is a one-byte string, construct
6660 bytes object directly. */
6661 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6662 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6663 PyUnicode_GET_LENGTH(unicode));
6664 /* Non-Latin-1 characters present. Defer to above function to
6665 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006667 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006668 errors);
6669}
6670
6671PyObject*
6672PyUnicode_AsLatin1String(PyObject *unicode)
6673{
6674 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006675}
6676
6677/* --- 7-bit ASCII Codec -------------------------------------------------- */
6678
Alexander Belopolsky40018472011-02-26 01:02:56 +00006679PyObject *
6680PyUnicode_DecodeASCII(const char *s,
6681 Py_ssize_t size,
6682 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006684 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006685 PyUnicodeObject *v;
Victor Stinner702c7342011-10-05 13:50:52 +02006686 Py_UNICODE *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006687 Py_ssize_t startinpos;
6688 Py_ssize_t endinpos;
6689 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006690 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006691 int has_error;
6692 const unsigned char *p = (const unsigned char *)s;
6693 const unsigned char *end = p + size;
6694 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006695 PyObject *errorHandler = NULL;
6696 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006697
Guido van Rossumd57fd912000-03-10 22:53:23 +00006698 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006699 if (size == 1 && (unsigned char)s[0] < 128)
6700 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006701
Victor Stinner702c7342011-10-05 13:50:52 +02006702 has_error = 0;
6703 while (p < end && !has_error) {
6704 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6705 an explanation. */
6706 if (!((size_t) p & LONG_PTR_MASK)) {
6707 /* Help register allocation */
6708 register const unsigned char *_p = p;
6709 while (_p < aligned_end) {
6710 unsigned long value = *(unsigned long *) _p;
6711 if (value & ASCII_CHAR_MASK) {
6712 has_error = 1;
6713 break;
6714 }
6715 _p += SIZEOF_LONG;
6716 }
6717 if (_p == end)
6718 break;
6719 if (has_error)
6720 break;
6721 p = _p;
6722 }
6723 if (*p & 0x80) {
6724 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006725 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006726 }
6727 else {
6728 ++p;
6729 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006730 }
Victor Stinner702c7342011-10-05 13:50:52 +02006731 if (!has_error)
6732 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006733
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734 v = _PyUnicode_New(size);
6735 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006736 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006738 return (PyObject *)v;
Victor Stinner702c7342011-10-05 13:50:52 +02006739 u = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006740 e = s + size;
6741 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006742 register unsigned char c = (unsigned char)*s;
6743 if (c < 128) {
Victor Stinner702c7342011-10-05 13:50:52 +02006744 *u++ = c;
Benjamin Peterson29060642009-01-31 22:14:21 +00006745 ++s;
6746 }
6747 else {
6748 startinpos = s-starts;
6749 endinpos = startinpos + 1;
Victor Stinner702c7342011-10-05 13:50:52 +02006750 outpos = u - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006751 if (unicode_decode_call_errorhandler(
6752 errors, &errorHandler,
6753 "ascii", "ordinal not in range(128)",
6754 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinner702c7342011-10-05 13:50:52 +02006755 &v, &outpos, &u))
Benjamin Peterson29060642009-01-31 22:14:21 +00006756 goto onError;
6757 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006758 }
Victor Stinner702c7342011-10-05 13:50:52 +02006759 if (u - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
6760 if (PyUnicode_Resize((PyObject**)&v, u - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006761 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006762 Py_XDECREF(errorHandler);
6763 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006764#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006765 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006766 Py_DECREF(v);
6767 return NULL;
6768 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006769#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006770 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006771 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006772
Benjamin Peterson29060642009-01-31 22:14:21 +00006773 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006774 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006775 Py_XDECREF(errorHandler);
6776 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006777 return NULL;
6778}
6779
Alexander Belopolsky40018472011-02-26 01:02:56 +00006780PyObject *
6781PyUnicode_EncodeASCII(const Py_UNICODE *p,
6782 Py_ssize_t size,
6783 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006784{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006785 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006786}
6787
Alexander Belopolsky40018472011-02-26 01:02:56 +00006788PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006789_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006790{
6791 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006792 PyErr_BadArgument();
6793 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006794 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006795 if (PyUnicode_READY(unicode) == -1)
6796 return NULL;
6797 /* Fast path: if it is an ASCII-only string, construct bytes object
6798 directly. Else defer to above function to raise the exception. */
6799 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6800 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6801 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006802 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006803 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006804 errors);
6805}
6806
6807PyObject *
6808PyUnicode_AsASCIIString(PyObject *unicode)
6809{
6810 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006811}
6812
Victor Stinner99b95382011-07-04 14:23:54 +02006813#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006814
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006815/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006816
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006817#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006818#define NEED_RETRY
6819#endif
6820
6821/* XXX This code is limited to "true" double-byte encodings, as
6822 a) it assumes an incomplete character consists of a single byte, and
6823 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006824 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006825
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead byte,
   0 otherwise.

   IsDBCSLeadByte() alone is not enough, because the byte may also be the
   trail byte of a preceding double-byte character; CharPrev() is used to
   look at the character that precedes it. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *here = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*here))
        return 0;

    before = CharPrev(s, here);
    if (before == here)
        /* CharPrev() did not move back: nothing precedes this byte. */
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    /* The previous character starts with a lead byte; accept only if it is
       a complete two-byte character ending just before `here`. */
    return (here - before == 2);
}
6837
6838/*
6839 * Decode MBCS string into unicode object. If 'final' is set, converts
6840 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6841 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006842static int
6843decode_mbcs(PyUnicodeObject **v,
6844 const char *s, /* MBCS string */
6845 int size, /* sizeof MBCS string */
6846 int final,
6847 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006848{
6849 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006850 Py_ssize_t n;
6851 DWORD usize;
6852 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006853
6854 assert(size >= 0);
6855
Victor Stinner554f3f02010-06-16 23:33:54 +00006856 /* check and handle 'errors' arg */
6857 if (errors==NULL || strcmp(errors, "strict")==0)
6858 flags = MB_ERR_INVALID_CHARS;
6859 else if (strcmp(errors, "ignore")==0)
6860 flags = 0;
6861 else {
6862 PyErr_Format(PyExc_ValueError,
6863 "mbcs encoding does not support errors='%s'",
6864 errors);
6865 return -1;
6866 }
6867
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006868 /* Skip trailing lead-byte unless 'final' is set */
6869 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006870 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006871
6872 /* First get the size of the result */
6873 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006874 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6875 if (usize==0)
6876 goto mbcs_decode_error;
6877 } else
6878 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006879
6880 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006881 /* Create unicode object */
6882 *v = _PyUnicode_New(usize);
6883 if (*v == NULL)
6884 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006885 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006886 }
6887 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006888 /* Extend unicode object */
6889 n = PyUnicode_GET_SIZE(*v);
Victor Stinner2fd82272011-10-03 04:06:05 +02006890 if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006891 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006892 }
6893
6894 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006895 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006896 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006897 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6898 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006899 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006900 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006901 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006902
6903mbcs_decode_error:
6904 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6905 we raise a UnicodeDecodeError - else it is a 'generic'
6906 windows error
6907 */
6908 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6909 /* Ideally, we should get reason from FormatMessage - this
6910 is the Windows 2000 English version of the message
6911 */
6912 PyObject *exc = NULL;
6913 const char *reason = "No mapping for the Unicode character exists "
6914 "in the target multi-byte code page.";
6915 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6916 if (exc != NULL) {
6917 PyCodec_StrictErrors(exc);
6918 Py_DECREF(exc);
6919 }
6920 } else {
6921 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6922 }
6923 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006924}
6925
Alexander Belopolsky40018472011-02-26 01:02:56 +00006926PyObject *
6927PyUnicode_DecodeMBCSStateful(const char *s,
6928 Py_ssize_t size,
6929 const char *errors,
6930 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006931{
6932 PyUnicodeObject *v = NULL;
6933 int done;
6934
6935 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006936 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006937
6938#ifdef NEED_RETRY
6939 retry:
6940 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006941 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006942 else
6943#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006944 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006945
6946 if (done < 0) {
6947 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006948 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006949 }
6950
6951 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006952 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006953
6954#ifdef NEED_RETRY
6955 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006956 s += done;
6957 size -= done;
6958 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006959 }
6960#endif
Victor Stinner17efeed2011-10-04 20:05:46 +02006961#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006962 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006963 Py_DECREF(v);
6964 return NULL;
6965 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006966#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006967 assert(_PyUnicode_CheckConsistency(v, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006968 return (PyObject *)v;
6969}
6970
Alexander Belopolsky40018472011-02-26 01:02:56 +00006971PyObject *
6972PyUnicode_DecodeMBCS(const char *s,
6973 Py_ssize_t size,
6974 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006975{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006976 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6977}
6978
6979/*
6980 * Convert unicode into string object (MBCS).
6981 * Returns 0 if succeed, -1 otherwise.
6982 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006983static int
6984encode_mbcs(PyObject **repr,
6985 const Py_UNICODE *p, /* unicode */
6986 int size, /* size of unicode */
6987 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006988{
Victor Stinner554f3f02010-06-16 23:33:54 +00006989 BOOL usedDefaultChar = FALSE;
6990 BOOL *pusedDefaultChar;
6991 int mbcssize;
6992 Py_ssize_t n;
6993 PyObject *exc = NULL;
6994 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006995
6996 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006997
Victor Stinner554f3f02010-06-16 23:33:54 +00006998 /* check and handle 'errors' arg */
6999 if (errors==NULL || strcmp(errors, "strict")==0) {
7000 flags = WC_NO_BEST_FIT_CHARS;
7001 pusedDefaultChar = &usedDefaultChar;
7002 } else if (strcmp(errors, "replace")==0) {
7003 flags = 0;
7004 pusedDefaultChar = NULL;
7005 } else {
7006 PyErr_Format(PyExc_ValueError,
7007 "mbcs encoding does not support errors='%s'",
7008 errors);
7009 return -1;
7010 }
7011
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007012 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007013 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00007014 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
7015 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00007016 if (mbcssize == 0) {
7017 PyErr_SetFromWindowsErrWithFilename(0, NULL);
7018 return -1;
7019 }
Victor Stinner554f3f02010-06-16 23:33:54 +00007020 /* If we used a default char, then we failed! */
7021 if (pusedDefaultChar && *pusedDefaultChar)
7022 goto mbcs_encode_error;
7023 } else {
7024 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007025 }
7026
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007027 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007028 /* Create string object */
7029 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
7030 if (*repr == NULL)
7031 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00007032 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007033 }
7034 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007035 /* Extend string object */
7036 n = PyBytes_Size(*repr);
7037 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
7038 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007039 }
7040
7041 /* Do the conversion */
7042 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007043 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00007044 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
7045 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007046 PyErr_SetFromWindowsErrWithFilename(0, NULL);
7047 return -1;
7048 }
Victor Stinner554f3f02010-06-16 23:33:54 +00007049 if (pusedDefaultChar && *pusedDefaultChar)
7050 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007051 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007052 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007053
7054mbcs_encode_error:
7055 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
7056 Py_XDECREF(exc);
7057 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007058}
7059
Alexander Belopolsky40018472011-02-26 01:02:56 +00007060PyObject *
7061PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7062 Py_ssize_t size,
7063 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007064{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007065 PyObject *repr = NULL;
7066 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00007067
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007068#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00007069 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007070 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00007071 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007072 else
7073#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00007074 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007075
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007076 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007077 Py_XDECREF(repr);
7078 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007079 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007080
7081#ifdef NEED_RETRY
7082 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007083 p += INT_MAX;
7084 size -= INT_MAX;
7085 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007086 }
7087#endif
7088
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007089 return repr;
7090}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007091
Alexander Belopolsky40018472011-02-26 01:02:56 +00007092PyObject *
7093PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007094{
7095 if (!PyUnicode_Check(unicode)) {
7096 PyErr_BadArgument();
7097 return NULL;
7098 }
7099 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007100 PyUnicode_GET_SIZE(unicode),
7101 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007102}
7103
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007104#undef NEED_RETRY
7105
Victor Stinner99b95382011-07-04 14:23:54 +02007106#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007107
Guido van Rossumd57fd912000-03-10 22:53:23 +00007108/* --- Character Mapping Codec -------------------------------------------- */
7109
Alexander Belopolsky40018472011-02-26 01:02:56 +00007110PyObject *
7111PyUnicode_DecodeCharmap(const char *s,
7112 Py_ssize_t size,
7113 PyObject *mapping,
7114 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007115{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007116 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007117 Py_ssize_t startinpos;
7118 Py_ssize_t endinpos;
7119 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007120 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007121 PyUnicodeObject *v;
7122 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007123 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007124 PyObject *errorHandler = NULL;
7125 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007126 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007127 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00007128
Guido van Rossumd57fd912000-03-10 22:53:23 +00007129 /* Default to Latin-1 */
7130 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007131 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007132
7133 v = _PyUnicode_New(size);
7134 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007135 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007136 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007137 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007138 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007139 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007140 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007141 mapstring = PyUnicode_AS_UNICODE(mapping);
7142 maplen = PyUnicode_GET_SIZE(mapping);
7143 while (s < e) {
7144 unsigned char ch = *s;
7145 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007146
Benjamin Peterson29060642009-01-31 22:14:21 +00007147 if (ch < maplen)
7148 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00007149
Benjamin Peterson29060642009-01-31 22:14:21 +00007150 if (x == 0xfffe) {
7151 /* undefined mapping */
7152 outpos = p-PyUnicode_AS_UNICODE(v);
7153 startinpos = s-starts;
7154 endinpos = startinpos+1;
7155 if (unicode_decode_call_errorhandler(
7156 errors, &errorHandler,
7157 "charmap", "character maps to <undefined>",
7158 &starts, &e, &startinpos, &endinpos, &exc, &s,
7159 &v, &outpos, &p)) {
7160 goto onError;
7161 }
7162 continue;
7163 }
7164 *p++ = x;
7165 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007166 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007167 }
7168 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007169 while (s < e) {
7170 unsigned char ch = *s;
7171 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007172
Benjamin Peterson29060642009-01-31 22:14:21 +00007173 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7174 w = PyLong_FromLong((long)ch);
7175 if (w == NULL)
7176 goto onError;
7177 x = PyObject_GetItem(mapping, w);
7178 Py_DECREF(w);
7179 if (x == NULL) {
7180 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7181 /* No mapping found means: mapping is undefined. */
7182 PyErr_Clear();
7183 x = Py_None;
7184 Py_INCREF(x);
7185 } else
7186 goto onError;
7187 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007188
Benjamin Peterson29060642009-01-31 22:14:21 +00007189 /* Apply mapping */
7190 if (PyLong_Check(x)) {
7191 long value = PyLong_AS_LONG(x);
7192 if (value < 0 || value > 65535) {
7193 PyErr_SetString(PyExc_TypeError,
7194 "character mapping must be in range(65536)");
7195 Py_DECREF(x);
7196 goto onError;
7197 }
7198 *p++ = (Py_UNICODE)value;
7199 }
7200 else if (x == Py_None) {
7201 /* undefined mapping */
7202 outpos = p-PyUnicode_AS_UNICODE(v);
7203 startinpos = s-starts;
7204 endinpos = startinpos+1;
7205 if (unicode_decode_call_errorhandler(
7206 errors, &errorHandler,
7207 "charmap", "character maps to <undefined>",
7208 &starts, &e, &startinpos, &endinpos, &exc, &s,
7209 &v, &outpos, &p)) {
7210 Py_DECREF(x);
7211 goto onError;
7212 }
7213 Py_DECREF(x);
7214 continue;
7215 }
7216 else if (PyUnicode_Check(x)) {
7217 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007218
Benjamin Peterson29060642009-01-31 22:14:21 +00007219 if (targetsize == 1)
7220 /* 1-1 mapping */
7221 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007222
Benjamin Peterson29060642009-01-31 22:14:21 +00007223 else if (targetsize > 1) {
7224 /* 1-n mapping */
7225 if (targetsize > extrachars) {
7226 /* resize first */
7227 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
7228 Py_ssize_t needed = (targetsize - extrachars) + \
7229 (targetsize << 2);
7230 extrachars += needed;
7231 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02007232 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00007233 PyUnicode_GET_SIZE(v) + needed) < 0) {
7234 Py_DECREF(x);
7235 goto onError;
7236 }
7237 p = PyUnicode_AS_UNICODE(v) + oldpos;
7238 }
7239 Py_UNICODE_COPY(p,
7240 PyUnicode_AS_UNICODE(x),
7241 targetsize);
7242 p += targetsize;
7243 extrachars -= targetsize;
7244 }
7245 /* 1-0 mapping: skip the character */
7246 }
7247 else {
7248 /* wrong return value */
7249 PyErr_SetString(PyExc_TypeError,
7250 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007251 Py_DECREF(x);
7252 goto onError;
7253 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007254 Py_DECREF(x);
7255 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007256 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007257 }
7258 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02007259 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007260 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007261 Py_XDECREF(errorHandler);
7262 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02007263#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007264 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007265 Py_DECREF(v);
7266 return NULL;
7267 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007268#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007269 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007270 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00007271
Benjamin Peterson29060642009-01-31 22:14:21 +00007272 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007273 Py_XDECREF(errorHandler);
7274 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007275 Py_XDECREF(v);
7276 return NULL;
7277}
7278
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007279/* Charmap encoding: the lookup table */
7280
Alexander Belopolsky40018472011-02-26 01:02:56 +00007281struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007282 PyObject_HEAD
7283 unsigned char level1[32];
7284 int count2, count3;
7285 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007286};
7287
7288static PyObject*
7289encoding_map_size(PyObject *obj, PyObject* args)
7290{
7291 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007292 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007293 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007294}
7295
/* Method table for EncodingMap objects: a single no-argument method
   "size", terminated by a zeroed sentinel entry. */
7296static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007297 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007298 PyDoc_STR("Return the size (in bytes) of this object") },
7299 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007300};
7301
/* tp_dealloc slot for EncodingMap: release the object's memory.  The
   object holds no references to other objects, so nothing else needs
   to be dropped here. */
7302static void
7303encoding_map_dealloc(PyObject* o)
7304{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007305 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007306}
7307
/* Type object for EncodingMap.  Only tp_dealloc and tp_methods are
   filled in; tp_new is 0 and instances are created in C by
   PyUnicode_BuildEncodingMap.  tp_itemsize is 0 even though
   instances carry a variable-length tail, because the allocation size
   is computed by hand at creation time. */
7308static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007309 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007310 "EncodingMap", /*tp_name*/
7311 sizeof(struct encoding_map), /*tp_basicsize*/
7312 0, /*tp_itemsize*/
7313 /* methods */
7314 encoding_map_dealloc, /*tp_dealloc*/
7315 0, /*tp_print*/
7316 0, /*tp_getattr*/
7317 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007318 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007319 0, /*tp_repr*/
7320 0, /*tp_as_number*/
7321 0, /*tp_as_sequence*/
7322 0, /*tp_as_mapping*/
7323 0, /*tp_hash*/
7324 0, /*tp_call*/
7325 0, /*tp_str*/
7326 0, /*tp_getattro*/
7327 0, /*tp_setattro*/
7328 0, /*tp_as_buffer*/
7329 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7330 0, /*tp_doc*/
7331 0, /*tp_traverse*/
7332 0, /*tp_clear*/
7333 0, /*tp_richcompare*/
7334 0, /*tp_weaklistoffset*/
7335 0, /*tp_iter*/
7336 0, /*tp_iternext*/
7337 encoding_map_methods, /*tp_methods*/
7338 0, /*tp_members*/
7339 0, /*tp_getset*/
7340 0, /*tp_base*/
7341 0, /*tp_dict*/
7342 0, /*tp_descr_get*/
7343 0, /*tp_descr_set*/
7344 0, /*tp_dictoffset*/
7345 0, /*tp_init*/
7346 0, /*tp_alloc*/
7347 0, /*tp_new*/
7348 0, /*tp_free*/
7349 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007350};
7351
7352PyObject*
7353PyUnicode_BuildEncodingMap(PyObject* string)
7354{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007355 PyObject *result;
7356 struct encoding_map *mresult;
7357 int i;
7358 int need_dict = 0;
7359 unsigned char level1[32];
7360 unsigned char level2[512];
7361 unsigned char *mlevel1, *mlevel2, *mlevel3;
7362 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007363 int kind;
7364 void *data;
7365 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007366
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007367 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007368 PyErr_BadArgument();
7369 return NULL;
7370 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007371 kind = PyUnicode_KIND(string);
7372 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007373 memset(level1, 0xFF, sizeof level1);
7374 memset(level2, 0xFF, sizeof level2);
7375
7376 /* If there isn't a one-to-one mapping of NULL to \0,
7377 or if there are non-BMP characters, we need to use
7378 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007379 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007380 need_dict = 1;
7381 for (i = 1; i < 256; i++) {
7382 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007383 ch = PyUnicode_READ(kind, data, i);
7384 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007385 need_dict = 1;
7386 break;
7387 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007388 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007389 /* unmapped character */
7390 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007391 l1 = ch >> 11;
7392 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007393 if (level1[l1] == 0xFF)
7394 level1[l1] = count2++;
7395 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007396 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007397 }
7398
7399 if (count2 >= 0xFF || count3 >= 0xFF)
7400 need_dict = 1;
7401
7402 if (need_dict) {
7403 PyObject *result = PyDict_New();
7404 PyObject *key, *value;
7405 if (!result)
7406 return NULL;
7407 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007408 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007409 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007410 if (!key || !value)
7411 goto failed1;
7412 if (PyDict_SetItem(result, key, value) == -1)
7413 goto failed1;
7414 Py_DECREF(key);
7415 Py_DECREF(value);
7416 }
7417 return result;
7418 failed1:
7419 Py_XDECREF(key);
7420 Py_XDECREF(value);
7421 Py_DECREF(result);
7422 return NULL;
7423 }
7424
7425 /* Create a three-level trie */
7426 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7427 16*count2 + 128*count3 - 1);
7428 if (!result)
7429 return PyErr_NoMemory();
7430 PyObject_Init(result, &EncodingMapType);
7431 mresult = (struct encoding_map*)result;
7432 mresult->count2 = count2;
7433 mresult->count3 = count3;
7434 mlevel1 = mresult->level1;
7435 mlevel2 = mresult->level23;
7436 mlevel3 = mresult->level23 + 16*count2;
7437 memcpy(mlevel1, level1, 32);
7438 memset(mlevel2, 0xFF, 16*count2);
7439 memset(mlevel3, 0, 128*count3);
7440 count3 = 0;
7441 for (i = 1; i < 256; i++) {
7442 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007443 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007444 /* unmapped character */
7445 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007446 o1 = PyUnicode_READ(kind, data, i)>>11;
7447 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007448 i2 = 16*mlevel1[o1] + o2;
7449 if (mlevel2[i2] == 0xFF)
7450 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007451 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007452 i3 = 128*mlevel2[i2] + o3;
7453 mlevel3[i3] = i;
7454 }
7455 return result;
7456}
7457
7458static int
7459encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7460{
7461 struct encoding_map *map = (struct encoding_map*)mapping;
7462 int l1 = c>>11;
7463 int l2 = (c>>7) & 0xF;
7464 int l3 = c & 0x7F;
7465 int i;
7466
7467#ifdef Py_UNICODE_WIDE
7468 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007469 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007470 }
7471#endif
7472 if (c == 0)
7473 return 0;
7474 /* level 1*/
7475 i = map->level1[l1];
7476 if (i == 0xFF) {
7477 return -1;
7478 }
7479 /* level 2*/
7480 i = map->level23[16*i+l2];
7481 if (i == 0xFF) {
7482 return -1;
7483 }
7484 /* level 3 */
7485 i = map->level23[16*map->count2 + 128*i + l3];
7486 if (i == 0) {
7487 return -1;
7488 }
7489 return i;
7490}
7491
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007492/* Lookup the character ch in the mapping. If the character
7493 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007494 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007495static PyObject *
7496charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007497{
Christian Heimes217cfd12007-12-02 14:31:20 +00007498 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007499 PyObject *x;
7500
7501 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007502 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007503 x = PyObject_GetItem(mapping, w);
7504 Py_DECREF(w);
7505 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007506 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7507 /* No mapping found means: mapping is undefined. */
7508 PyErr_Clear();
7509 x = Py_None;
7510 Py_INCREF(x);
7511 return x;
7512 } else
7513 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007514 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007515 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007516 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007517 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007518 long value = PyLong_AS_LONG(x);
7519 if (value < 0 || value > 255) {
7520 PyErr_SetString(PyExc_TypeError,
7521 "character mapping must be in range(256)");
7522 Py_DECREF(x);
7523 return NULL;
7524 }
7525 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007526 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007527 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007528 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007529 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007530 /* wrong return value */
7531 PyErr_Format(PyExc_TypeError,
7532 "character mapping must return integer, bytes or None, not %.400s",
7533 x->ob_type->tp_name);
7534 Py_DECREF(x);
7535 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007536 }
7537}
7538
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007539static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007540charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007541{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007542 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7543 /* exponentially overallocate to minimize reallocations */
7544 if (requiredsize < 2*outsize)
7545 requiredsize = 2*outsize;
7546 if (_PyBytes_Resize(outobj, requiredsize))
7547 return -1;
7548 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007549}
7550
/* Outcome of charmapencode_output: enc_SUCCESS = output was written,
   enc_FAILED = the character has no mapping (nothing written),
   enc_EXCEPTION = a lookup or resize error occurred. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00007551typedef enum charmapencode_result {
Benjamin Peterson29060642009-01-31 22:14:21 +00007552 enc_SUCCESS, enc_FAILED, enc_EXCEPTION
Alexander Belopolsky40018472011-02-26 01:02:56 +00007553} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007554/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007555 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007556 space is available. Return a new reference to the object that
7557 was put in the output buffer, or Py_None, if the mapping was undefined
7558 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007559 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007560static charmapencode_result
7561charmapencode_output(Py_UNICODE c, PyObject *mapping,
7562 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007563{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007564 PyObject *rep;
7565 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007566 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007567
Christian Heimes90aa7642007-12-19 02:45:37 +00007568 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007569 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007570 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007571 if (res == -1)
7572 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007573 if (outsize<requiredsize)
7574 if (charmapencode_resize(outobj, outpos, requiredsize))
7575 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007576 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007577 outstart[(*outpos)++] = (char)res;
7578 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007579 }
7580
7581 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007582 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007583 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007584 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007585 Py_DECREF(rep);
7586 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007587 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007588 if (PyLong_Check(rep)) {
7589 Py_ssize_t requiredsize = *outpos+1;
7590 if (outsize<requiredsize)
7591 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7592 Py_DECREF(rep);
7593 return enc_EXCEPTION;
7594 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007595 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007596 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007597 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007598 else {
7599 const char *repchars = PyBytes_AS_STRING(rep);
7600 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7601 Py_ssize_t requiredsize = *outpos+repsize;
7602 if (outsize<requiredsize)
7603 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7604 Py_DECREF(rep);
7605 return enc_EXCEPTION;
7606 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007607 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007608 memcpy(outstart + *outpos, repchars, repsize);
7609 *outpos += repsize;
7610 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007611 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007612 Py_DECREF(rep);
7613 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007614}
7615
7616/* handle an error in PyUnicode_EncodeCharmap
7617 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007618static int
7619charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00007620 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007621 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007622 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007623 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007624{
7625 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007626 Py_ssize_t repsize;
7627 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007628 Py_UNICODE *uni2;
7629 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007630 Py_ssize_t collstartpos = *inpos;
7631 Py_ssize_t collendpos = *inpos+1;
7632 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007633 char *encoding = "charmap";
7634 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007635 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007636
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007637 /* find all unencodable characters */
7638 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007639 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007640 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007641 int res = encoding_map_lookup(p[collendpos], mapping);
7642 if (res != -1)
7643 break;
7644 ++collendpos;
7645 continue;
7646 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007647
Benjamin Peterson29060642009-01-31 22:14:21 +00007648 rep = charmapencode_lookup(p[collendpos], mapping);
7649 if (rep==NULL)
7650 return -1;
7651 else if (rep!=Py_None) {
7652 Py_DECREF(rep);
7653 break;
7654 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007655 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007656 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007657 }
7658 /* cache callback name lookup
7659 * (if not done yet, i.e. it's the first error) */
7660 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007661 if ((errors==NULL) || (!strcmp(errors, "strict")))
7662 *known_errorHandler = 1;
7663 else if (!strcmp(errors, "replace"))
7664 *known_errorHandler = 2;
7665 else if (!strcmp(errors, "ignore"))
7666 *known_errorHandler = 3;
7667 else if (!strcmp(errors, "xmlcharrefreplace"))
7668 *known_errorHandler = 4;
7669 else
7670 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007671 }
7672 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007673 case 1: /* strict */
7674 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7675 return -1;
7676 case 2: /* replace */
7677 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007678 x = charmapencode_output('?', mapping, res, respos);
7679 if (x==enc_EXCEPTION) {
7680 return -1;
7681 }
7682 else if (x==enc_FAILED) {
7683 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7684 return -1;
7685 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007686 }
7687 /* fall through */
7688 case 3: /* ignore */
7689 *inpos = collendpos;
7690 break;
7691 case 4: /* xmlcharrefreplace */
7692 /* generate replacement (temporarily (mis)uses p) */
7693 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007694 char buffer[2+29+1+1];
7695 char *cp;
7696 sprintf(buffer, "&#%d;", (int)p[collpos]);
7697 for (cp = buffer; *cp; ++cp) {
7698 x = charmapencode_output(*cp, mapping, res, respos);
7699 if (x==enc_EXCEPTION)
7700 return -1;
7701 else if (x==enc_FAILED) {
7702 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7703 return -1;
7704 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007705 }
7706 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007707 *inpos = collendpos;
7708 break;
7709 default:
7710 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007711 encoding, reason, p, size, exceptionObject,
7712 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007713 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007714 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007715 if (PyBytes_Check(repunicode)) {
7716 /* Directly copy bytes result to output. */
7717 Py_ssize_t outsize = PyBytes_Size(*res);
7718 Py_ssize_t requiredsize;
7719 repsize = PyBytes_Size(repunicode);
7720 requiredsize = *respos + repsize;
7721 if (requiredsize > outsize)
7722 /* Make room for all additional bytes. */
7723 if (charmapencode_resize(res, respos, requiredsize)) {
7724 Py_DECREF(repunicode);
7725 return -1;
7726 }
7727 memcpy(PyBytes_AsString(*res) + *respos,
7728 PyBytes_AsString(repunicode), repsize);
7729 *respos += repsize;
7730 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007731 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007732 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007733 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007734 /* generate replacement */
7735 repsize = PyUnicode_GET_SIZE(repunicode);
7736 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007737 x = charmapencode_output(*uni2, mapping, res, respos);
7738 if (x==enc_EXCEPTION) {
7739 return -1;
7740 }
7741 else if (x==enc_FAILED) {
7742 Py_DECREF(repunicode);
7743 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7744 return -1;
7745 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007746 }
7747 *inpos = newpos;
7748 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007749 }
7750 return 0;
7751}
7752
Alexander Belopolsky40018472011-02-26 01:02:56 +00007753PyObject *
7754PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7755 Py_ssize_t size,
7756 PyObject *mapping,
7757 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007758{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007759 /* output object */
7760 PyObject *res = NULL;
7761 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007762 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007763 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007764 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007765 PyObject *errorHandler = NULL;
7766 PyObject *exc = NULL;
7767 /* the following variable is used for caching string comparisons
7768 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7769 * 3=ignore, 4=xmlcharrefreplace */
7770 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007771
7772 /* Default to Latin-1 */
7773 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007774 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007775
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007776 /* allocate enough for a simple encoding without
7777 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007778 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007779 if (res == NULL)
7780 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007781 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007782 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007783
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007784 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007785 /* try to encode it */
7786 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7787 if (x==enc_EXCEPTION) /* error */
7788 goto onError;
7789 if (x==enc_FAILED) { /* unencodable character */
7790 if (charmap_encoding_error(p, size, &inpos, mapping,
7791 &exc,
7792 &known_errorHandler, &errorHandler, errors,
7793 &res, &respos)) {
7794 goto onError;
7795 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007796 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007797 else
7798 /* done with this character => adjust input position */
7799 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007800 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007801
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007802 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007803 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007804 if (_PyBytes_Resize(&res, respos) < 0)
7805 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007806
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007807 Py_XDECREF(exc);
7808 Py_XDECREF(errorHandler);
7809 return res;
7810
Benjamin Peterson29060642009-01-31 22:14:21 +00007811 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007812 Py_XDECREF(res);
7813 Py_XDECREF(exc);
7814 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007815 return NULL;
7816}
7817
Alexander Belopolsky40018472011-02-26 01:02:56 +00007818PyObject *
7819PyUnicode_AsCharmapString(PyObject *unicode,
7820 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007821{
7822 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007823 PyErr_BadArgument();
7824 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007825 }
7826 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007827 PyUnicode_GET_SIZE(unicode),
7828 mapping,
7829 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007830}
7831
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007832/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007833static void
7834make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007835 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007836 Py_ssize_t startpos, Py_ssize_t endpos,
7837 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007838{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007839 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007840 *exceptionObject = _PyUnicodeTranslateError_Create(
7841 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007842 }
7843 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007844 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7845 goto onError;
7846 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7847 goto onError;
7848 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7849 goto onError;
7850 return;
7851 onError:
7852 Py_DECREF(*exceptionObject);
7853 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007854 }
7855}
7856
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007857/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007858static void
7859raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007860 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007861 Py_ssize_t startpos, Py_ssize_t endpos,
7862 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007863{
7864 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007865 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007866 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007867 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007868}
7869
7870/* error handling callback helper:
7871 build arguments, call the callback and check the arguments,
7872 put the result into newpos and return the replacement string, which
7873 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007874static PyObject *
7875unicode_translate_call_errorhandler(const char *errors,
7876 PyObject **errorHandler,
7877 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007878 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007879 Py_ssize_t startpos, Py_ssize_t endpos,
7880 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007881{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007882 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007883
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007884 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007885 PyObject *restuple;
7886 PyObject *resunicode;
7887
7888 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007889 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007890 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007891 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007892 }
7893
7894 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007895 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007896 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007897 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007898
7899 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007900 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007901 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007902 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007903 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007904 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007905 Py_DECREF(restuple);
7906 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007907 }
7908 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007909 &resunicode, &i_newpos)) {
7910 Py_DECREF(restuple);
7911 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007912 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007913 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007914 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007915 else
7916 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007917 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007918 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7919 Py_DECREF(restuple);
7920 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007921 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007922 Py_INCREF(resunicode);
7923 Py_DECREF(restuple);
7924 return resunicode;
7925}
7926
7927/* Lookup the character ch in the mapping and put the result in result,
7928 which must be decrefed by the caller.
7929 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007930static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007931charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007932{
Christian Heimes217cfd12007-12-02 14:31:20 +00007933 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007934 PyObject *x;
7935
7936 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007937 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007938 x = PyObject_GetItem(mapping, w);
7939 Py_DECREF(w);
7940 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007941 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7942 /* No mapping found means: use 1:1 mapping. */
7943 PyErr_Clear();
7944 *result = NULL;
7945 return 0;
7946 } else
7947 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007948 }
7949 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007950 *result = x;
7951 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007952 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007953 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007954 long value = PyLong_AS_LONG(x);
7955 long max = PyUnicode_GetMax();
7956 if (value < 0 || value > max) {
7957 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007958 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007959 Py_DECREF(x);
7960 return -1;
7961 }
7962 *result = x;
7963 return 0;
7964 }
7965 else if (PyUnicode_Check(x)) {
7966 *result = x;
7967 return 0;
7968 }
7969 else {
7970 /* wrong return value */
7971 PyErr_SetString(PyExc_TypeError,
7972 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007973 Py_DECREF(x);
7974 return -1;
7975 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007976}
7977/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007978 if not reallocate and adjust various state variables.
7979 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007980static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007981charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007982 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007983{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007984 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007985 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007986 /* exponentially overallocate to minimize reallocations */
7987 if (requiredsize < 2 * oldsize)
7988 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007989 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7990 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007991 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007992 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007993 }
7994 return 0;
7995}
7996/* lookup the character, put the result in the output string and adjust
7997 various state variables. Return a new reference to the object that
7998 was put in the output buffer in *result, or Py_None, if the mapping was
7999 undefined (in which case no character was written).
8000 The called must decref result.
8001 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008002static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008003charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8004 PyObject *mapping, Py_UCS4 **output,
8005 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008006 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008007{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008008 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8009 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008010 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008011 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008012 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008013 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008014 }
8015 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008016 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008017 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008018 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008019 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008020 }
8021 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008022 Py_ssize_t repsize;
8023 if (PyUnicode_READY(*res) == -1)
8024 return -1;
8025 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008026 if (repsize==1) {
8027 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008028 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008029 }
8030 else if (repsize!=0) {
8031 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008032 Py_ssize_t requiredsize = *opos +
8033 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008034 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008035 Py_ssize_t i;
8036 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008037 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008038 for(i = 0; i < repsize; i++)
8039 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008040 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008041 }
8042 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008043 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008044 return 0;
8045}
8046
Alexander Belopolsky40018472011-02-26 01:02:56 +00008047PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008048_PyUnicode_TranslateCharmap(PyObject *input,
8049 PyObject *mapping,
8050 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008051{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008052 /* input object */
8053 char *idata;
8054 Py_ssize_t size, i;
8055 int kind;
8056 /* output buffer */
8057 Py_UCS4 *output = NULL;
8058 Py_ssize_t osize;
8059 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008060 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008061 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008062 char *reason = "character maps to <undefined>";
8063 PyObject *errorHandler = NULL;
8064 PyObject *exc = NULL;
8065 /* the following variable is used for caching string comparisons
8066 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8067 * 3=ignore, 4=xmlcharrefreplace */
8068 int known_errorHandler = -1;
8069
Guido van Rossumd57fd912000-03-10 22:53:23 +00008070 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008071 PyErr_BadArgument();
8072 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008073 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008074
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008075 if (PyUnicode_READY(input) == -1)
8076 return NULL;
8077 idata = (char*)PyUnicode_DATA(input);
8078 kind = PyUnicode_KIND(input);
8079 size = PyUnicode_GET_LENGTH(input);
8080 i = 0;
8081
8082 if (size == 0) {
8083 Py_INCREF(input);
8084 return input;
8085 }
8086
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008087 /* allocate enough for a simple 1:1 translation without
8088 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008089 osize = size;
8090 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8091 opos = 0;
8092 if (output == NULL) {
8093 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008094 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008095 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008096
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008097 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008098 /* try to encode it */
8099 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008100 if (charmaptranslate_output(input, i, mapping,
8101 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008102 Py_XDECREF(x);
8103 goto onError;
8104 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008105 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008106 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008107 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008108 else { /* untranslatable character */
8109 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8110 Py_ssize_t repsize;
8111 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008112 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008113 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008114 Py_ssize_t collstart = i;
8115 Py_ssize_t collend = i+1;
8116 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008117
Benjamin Peterson29060642009-01-31 22:14:21 +00008118 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008119 while (collend < size) {
8120 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008121 goto onError;
8122 Py_XDECREF(x);
8123 if (x!=Py_None)
8124 break;
8125 ++collend;
8126 }
8127 /* cache callback name lookup
8128 * (if not done yet, i.e. it's the first error) */
8129 if (known_errorHandler==-1) {
8130 if ((errors==NULL) || (!strcmp(errors, "strict")))
8131 known_errorHandler = 1;
8132 else if (!strcmp(errors, "replace"))
8133 known_errorHandler = 2;
8134 else if (!strcmp(errors, "ignore"))
8135 known_errorHandler = 3;
8136 else if (!strcmp(errors, "xmlcharrefreplace"))
8137 known_errorHandler = 4;
8138 else
8139 known_errorHandler = 0;
8140 }
8141 switch (known_errorHandler) {
8142 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008143 raise_translate_exception(&exc, input, collstart,
8144 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008145 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008146 case 2: /* replace */
8147 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008148 for (coll = collstart; coll<collend; coll++)
8149 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008150 /* fall through */
8151 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008152 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008153 break;
8154 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008155 /* generate replacement (temporarily (mis)uses i) */
8156 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008157 char buffer[2+29+1+1];
8158 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008159 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8160 if (charmaptranslate_makespace(&output, &osize,
8161 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008162 goto onError;
8163 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008164 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008165 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008166 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008167 break;
8168 default:
8169 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008170 reason, input, &exc,
8171 collstart, collend, &newpos);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008172 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00008173 goto onError;
8174 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008175 repsize = PyUnicode_GET_LENGTH(repunicode);
8176 if (charmaptranslate_makespace(&output, &osize,
8177 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008178 Py_DECREF(repunicode);
8179 goto onError;
8180 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008181 for (uni2 = 0; repsize-->0; ++uni2)
8182 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8183 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008184 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008185 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008186 }
8187 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008188 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8189 if (!res)
8190 goto onError;
8191 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008192 Py_XDECREF(exc);
8193 Py_XDECREF(errorHandler);
8194 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008195
Benjamin Peterson29060642009-01-31 22:14:21 +00008196 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008197 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008198 Py_XDECREF(exc);
8199 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008200 return NULL;
8201}
8202
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008203/* Deprecated. Use PyUnicode_Translate instead. */
8204PyObject *
8205PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8206 Py_ssize_t size,
8207 PyObject *mapping,
8208 const char *errors)
8209{
8210 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8211 if (!unicode)
8212 return NULL;
8213 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8214}
8215
Alexander Belopolsky40018472011-02-26 01:02:56 +00008216PyObject *
8217PyUnicode_Translate(PyObject *str,
8218 PyObject *mapping,
8219 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008220{
8221 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008222
Guido van Rossumd57fd912000-03-10 22:53:23 +00008223 str = PyUnicode_FromObject(str);
8224 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008225 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008226 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008227 Py_DECREF(str);
8228 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008229
Benjamin Peterson29060642009-01-31 22:14:21 +00008230 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008231 Py_XDECREF(str);
8232 return NULL;
8233}
Tim Petersced69f82003-09-16 20:30:58 +00008234
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008235static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008236fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008237{
8238 /* No need to call PyUnicode_READY(self) because this function is only
8239 called as a callback from fixup() which does it already. */
8240 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8241 const int kind = PyUnicode_KIND(self);
8242 void *data = PyUnicode_DATA(self);
8243 Py_UCS4 maxchar = 0, ch, fixed;
8244 Py_ssize_t i;
8245
8246 for (i = 0; i < len; ++i) {
8247 ch = PyUnicode_READ(kind, data, i);
8248 fixed = 0;
8249 if (ch > 127) {
8250 if (Py_UNICODE_ISSPACE(ch))
8251 fixed = ' ';
8252 else {
8253 const int decimal = Py_UNICODE_TODECIMAL(ch);
8254 if (decimal >= 0)
8255 fixed = '0' + decimal;
8256 }
8257 if (fixed != 0) {
8258 if (fixed > maxchar)
8259 maxchar = fixed;
8260 PyUnicode_WRITE(kind, data, i, fixed);
8261 }
8262 else if (ch > maxchar)
8263 maxchar = ch;
8264 }
8265 else if (ch > maxchar)
8266 maxchar = ch;
8267 }
8268
8269 return maxchar;
8270}
8271
8272PyObject *
8273_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8274{
8275 if (!PyUnicode_Check(unicode)) {
8276 PyErr_BadInternalCall();
8277 return NULL;
8278 }
8279 if (PyUnicode_READY(unicode) == -1)
8280 return NULL;
8281 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8282 /* If the string is already ASCII, just return the same string */
8283 Py_INCREF(unicode);
8284 return unicode;
8285 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008286 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008287}
8288
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008289PyObject *
8290PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8291 Py_ssize_t length)
8292{
8293 PyObject *result;
8294 Py_UNICODE *p; /* write pointer into result */
8295 Py_ssize_t i;
8296 /* Copy to a new string */
8297 result = (PyObject *)_PyUnicode_New(length);
8298 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8299 if (result == NULL)
8300 return result;
8301 p = PyUnicode_AS_UNICODE(result);
8302 /* Iterate over code points */
8303 for (i = 0; i < length; i++) {
8304 Py_UNICODE ch =s[i];
8305 if (ch > 127) {
8306 int decimal = Py_UNICODE_TODECIMAL(ch);
8307 if (decimal >= 0)
8308 p[i] = '0' + decimal;
8309 }
8310 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008311#ifndef DONT_MAKE_RESULT_READY
8312 if (_PyUnicode_READY_REPLACE(&result)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008313 Py_DECREF(result);
8314 return NULL;
8315 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008316#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02008317 assert(_PyUnicode_CheckConsistency(result, 1));
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008318 return result;
8319}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008320/* --- Decimal Encoder ---------------------------------------------------- */
8321
Alexander Belopolsky40018472011-02-26 01:02:56 +00008322int
8323PyUnicode_EncodeDecimal(Py_UNICODE *s,
8324 Py_ssize_t length,
8325 char *output,
8326 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008327{
8328 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008329 PyObject *errorHandler = NULL;
8330 PyObject *exc = NULL;
8331 const char *encoding = "decimal";
8332 const char *reason = "invalid decimal Unicode string";
8333 /* the following variable is used for caching string comparisons
8334 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8335 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008336
8337 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008338 PyErr_BadArgument();
8339 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008340 }
8341
8342 p = s;
8343 end = s + length;
8344 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008345 register Py_UNICODE ch = *p;
8346 int decimal;
8347 PyObject *repunicode;
8348 Py_ssize_t repsize;
8349 Py_ssize_t newpos;
8350 Py_UNICODE *uni2;
8351 Py_UNICODE *collstart;
8352 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00008353
Benjamin Peterson29060642009-01-31 22:14:21 +00008354 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008355 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008356 ++p;
8357 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008358 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008359 decimal = Py_UNICODE_TODECIMAL(ch);
8360 if (decimal >= 0) {
8361 *output++ = '0' + decimal;
8362 ++p;
8363 continue;
8364 }
8365 if (0 < ch && ch < 256) {
8366 *output++ = (char)ch;
8367 ++p;
8368 continue;
8369 }
8370 /* All other characters are considered unencodable */
8371 collstart = p;
8372 collend = p+1;
8373 while (collend < end) {
8374 if ((0 < *collend && *collend < 256) ||
8375 !Py_UNICODE_ISSPACE(*collend) ||
8376 Py_UNICODE_TODECIMAL(*collend))
8377 break;
8378 }
8379 /* cache callback name lookup
8380 * (if not done yet, i.e. it's the first error) */
8381 if (known_errorHandler==-1) {
8382 if ((errors==NULL) || (!strcmp(errors, "strict")))
8383 known_errorHandler = 1;
8384 else if (!strcmp(errors, "replace"))
8385 known_errorHandler = 2;
8386 else if (!strcmp(errors, "ignore"))
8387 known_errorHandler = 3;
8388 else if (!strcmp(errors, "xmlcharrefreplace"))
8389 known_errorHandler = 4;
8390 else
8391 known_errorHandler = 0;
8392 }
8393 switch (known_errorHandler) {
8394 case 1: /* strict */
8395 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8396 goto onError;
8397 case 2: /* replace */
8398 for (p = collstart; p < collend; ++p)
8399 *output++ = '?';
8400 /* fall through */
8401 case 3: /* ignore */
8402 p = collend;
8403 break;
8404 case 4: /* xmlcharrefreplace */
8405 /* generate replacement (temporarily (mis)uses p) */
8406 for (p = collstart; p < collend; ++p)
8407 output += sprintf(output, "&#%d;", (int)*p);
8408 p = collend;
8409 break;
8410 default:
8411 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8412 encoding, reason, s, length, &exc,
8413 collstart-s, collend-s, &newpos);
8414 if (repunicode == NULL)
8415 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008416 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008417 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008418 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8419 Py_DECREF(repunicode);
8420 goto onError;
8421 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008422 /* generate replacement */
8423 repsize = PyUnicode_GET_SIZE(repunicode);
8424 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8425 Py_UNICODE ch = *uni2;
8426 if (Py_UNICODE_ISSPACE(ch))
8427 *output++ = ' ';
8428 else {
8429 decimal = Py_UNICODE_TODECIMAL(ch);
8430 if (decimal >= 0)
8431 *output++ = '0' + decimal;
8432 else if (0 < ch && ch < 256)
8433 *output++ = (char)ch;
8434 else {
8435 Py_DECREF(repunicode);
8436 raise_encode_exception(&exc, encoding,
8437 s, length, collstart-s, collend-s, reason);
8438 goto onError;
8439 }
8440 }
8441 }
8442 p = s + newpos;
8443 Py_DECREF(repunicode);
8444 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008445 }
8446 /* 0-terminate the output string */
8447 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008448 Py_XDECREF(exc);
8449 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008450 return 0;
8451
Benjamin Peterson29060642009-01-31 22:14:21 +00008452 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008453 Py_XDECREF(exc);
8454 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008455 return -1;
8456}
8457
Guido van Rossumd57fd912000-03-10 22:53:23 +00008458/* --- Helpers ------------------------------------------------------------ */
8459
Victor Stinnerc3cec782011-10-05 21:24:08 +02008460#include "stringlib/asciilib.h"
8461#include "stringlib/fastsearch.h"
8462#include "stringlib/partition.h"
8463#include "stringlib/split.h"
8464#include "stringlib/count.h"
8465#include "stringlib/find.h"
8466#include "stringlib/localeutil.h"
8467#include "stringlib/undef.h"
8468
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008469#include "stringlib/ucs1lib.h"
8470#include "stringlib/fastsearch.h"
8471#include "stringlib/partition.h"
8472#include "stringlib/split.h"
8473#include "stringlib/count.h"
8474#include "stringlib/find.h"
8475#include "stringlib/localeutil.h"
8476#include "stringlib/undef.h"
8477
8478#include "stringlib/ucs2lib.h"
8479#include "stringlib/fastsearch.h"
8480#include "stringlib/partition.h"
8481#include "stringlib/split.h"
8482#include "stringlib/count.h"
8483#include "stringlib/find.h"
8484#include "stringlib/localeutil.h"
8485#include "stringlib/undef.h"
8486
8487#include "stringlib/ucs4lib.h"
8488#include "stringlib/fastsearch.h"
8489#include "stringlib/partition.h"
8490#include "stringlib/split.h"
8491#include "stringlib/count.h"
8492#include "stringlib/find.h"
8493#include "stringlib/localeutil.h"
8494#include "stringlib/undef.h"
8495
8496static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02008497any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ascii)(const Py_UCS1*, Py_ssize_t,
8498 const Py_UCS1*, Py_ssize_t,
8499 Py_ssize_t, Py_ssize_t),
8500 Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008501 const Py_UCS1*, Py_ssize_t,
8502 Py_ssize_t, Py_ssize_t),
8503 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
8504 const Py_UCS2*, Py_ssize_t,
8505 Py_ssize_t, Py_ssize_t),
8506 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
8507 const Py_UCS4*, Py_ssize_t,
8508 Py_ssize_t, Py_ssize_t),
8509 PyObject* s1, PyObject* s2,
8510 Py_ssize_t start,
8511 Py_ssize_t end)
8512{
8513 int kind1, kind2, kind;
8514 void *buf1, *buf2;
8515 Py_ssize_t len1, len2, result;
8516
8517 kind1 = PyUnicode_KIND(s1);
8518 kind2 = PyUnicode_KIND(s2);
8519 kind = kind1 > kind2 ? kind1 : kind2;
8520 buf1 = PyUnicode_DATA(s1);
8521 buf2 = PyUnicode_DATA(s2);
8522 if (kind1 != kind)
8523 buf1 = _PyUnicode_AsKind(s1, kind);
8524 if (!buf1)
8525 return -2;
8526 if (kind2 != kind)
8527 buf2 = _PyUnicode_AsKind(s2, kind);
8528 if (!buf2) {
8529 if (kind1 != kind) PyMem_Free(buf1);
8530 return -2;
8531 }
8532 len1 = PyUnicode_GET_LENGTH(s1);
8533 len2 = PyUnicode_GET_LENGTH(s2);
8534
8535 switch(kind) {
8536 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008537 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8538 result = ascii(buf1, len1, buf2, len2, start, end);
8539 else
8540 result = ucs1(buf1, len1, buf2, len2, start, end);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008541 break;
8542 case PyUnicode_2BYTE_KIND:
8543 result = ucs2(buf1, len1, buf2, len2, start, end);
8544 break;
8545 case PyUnicode_4BYTE_KIND:
8546 result = ucs4(buf1, len1, buf2, len2, start, end);
8547 break;
8548 default:
8549 assert(0); result = -2;
8550 }
8551
8552 if (kind1 != kind)
8553 PyMem_Free(buf1);
8554 if (kind2 != kind)
8555 PyMem_Free(buf2);
8556
8557 return result;
8558}
8559
8560Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02008561_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008562 Py_ssize_t n_buffer,
8563 void *digits, Py_ssize_t n_digits,
8564 Py_ssize_t min_width,
8565 const char *grouping,
8566 const char *thousands_sep)
8567{
8568 switch(kind) {
8569 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008570 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
8571 return _PyUnicode_ascii_InsertThousandsGrouping(
8572 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8573 min_width, grouping, thousands_sep);
8574 else
8575 return _PyUnicode_ucs1_InsertThousandsGrouping(
8576 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8577 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008578 case PyUnicode_2BYTE_KIND:
8579 return _PyUnicode_ucs2_InsertThousandsGrouping(
8580 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8581 min_width, grouping, thousands_sep);
8582 case PyUnicode_4BYTE_KIND:
8583 return _PyUnicode_ucs4_InsertThousandsGrouping(
8584 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8585 min_width, grouping, thousands_sep);
8586 }
8587 assert(0);
8588 return -1;
8589}
8590
8591
Eric Smith8c663262007-08-25 02:26:07 +00008592#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00008593#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008594
Thomas Wouters477c8d52006-05-27 19:21:47 +00008595#include "stringlib/count.h"
8596#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00008597
/* helper macro to fixup start/end slice values against a sequence length:
   end is clipped to len, negative start/end count from the end of the
   sequence and are clipped to 0.  start and end must be modifiable lvalues;
   all three arguments are evaluated more than once.
   Wrapped in do { } while (0) so that the expansion is a single statement
   and can be used safely in an unbraced if/else; a trailing ';' is
   required at the call site. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008612
Alexander Belopolsky40018472011-02-26 01:02:56 +00008613Py_ssize_t
8614PyUnicode_Count(PyObject *str,
8615 PyObject *substr,
8616 Py_ssize_t start,
8617 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008618{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008619 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008620 PyUnicodeObject* str_obj;
8621 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008622 int kind1, kind2, kind;
8623 void *buf1 = NULL, *buf2 = NULL;
8624 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008625
Thomas Wouters477c8d52006-05-27 19:21:47 +00008626 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008627 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008628 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008629 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008630 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008631 Py_DECREF(str_obj);
8632 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008633 }
Tim Petersced69f82003-09-16 20:30:58 +00008634
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008635 kind1 = PyUnicode_KIND(str_obj);
8636 kind2 = PyUnicode_KIND(sub_obj);
8637 kind = kind1 > kind2 ? kind1 : kind2;
8638 buf1 = PyUnicode_DATA(str_obj);
8639 if (kind1 != kind)
8640 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8641 if (!buf1)
8642 goto onError;
8643 buf2 = PyUnicode_DATA(sub_obj);
8644 if (kind2 != kind)
8645 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8646 if (!buf2)
8647 goto onError;
8648 len1 = PyUnicode_GET_LENGTH(str_obj);
8649 len2 = PyUnicode_GET_LENGTH(sub_obj);
8650
8651 ADJUST_INDICES(start, end, len1);
8652 switch(kind) {
8653 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008654 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8655 result = asciilib_count(
8656 ((Py_UCS1*)buf1) + start, end - start,
8657 buf2, len2, PY_SSIZE_T_MAX
8658 );
8659 else
8660 result = ucs1lib_count(
8661 ((Py_UCS1*)buf1) + start, end - start,
8662 buf2, len2, PY_SSIZE_T_MAX
8663 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008664 break;
8665 case PyUnicode_2BYTE_KIND:
8666 result = ucs2lib_count(
8667 ((Py_UCS2*)buf1) + start, end - start,
8668 buf2, len2, PY_SSIZE_T_MAX
8669 );
8670 break;
8671 case PyUnicode_4BYTE_KIND:
8672 result = ucs4lib_count(
8673 ((Py_UCS4*)buf1) + start, end - start,
8674 buf2, len2, PY_SSIZE_T_MAX
8675 );
8676 break;
8677 default:
8678 assert(0); result = 0;
8679 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008680
8681 Py_DECREF(sub_obj);
8682 Py_DECREF(str_obj);
8683
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008684 if (kind1 != kind)
8685 PyMem_Free(buf1);
8686 if (kind2 != kind)
8687 PyMem_Free(buf2);
8688
Guido van Rossumd57fd912000-03-10 22:53:23 +00008689 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008690 onError:
8691 Py_DECREF(sub_obj);
8692 Py_DECREF(str_obj);
8693 if (kind1 != kind && buf1)
8694 PyMem_Free(buf1);
8695 if (kind2 != kind && buf2)
8696 PyMem_Free(buf2);
8697 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008698}
8699
Alexander Belopolsky40018472011-02-26 01:02:56 +00008700Py_ssize_t
8701PyUnicode_Find(PyObject *str,
8702 PyObject *sub,
8703 Py_ssize_t start,
8704 Py_ssize_t end,
8705 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008706{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008707 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008708
Guido van Rossumd57fd912000-03-10 22:53:23 +00008709 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008710 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008711 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008712 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008713 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008714 Py_DECREF(str);
8715 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008716 }
Tim Petersced69f82003-09-16 20:30:58 +00008717
Thomas Wouters477c8d52006-05-27 19:21:47 +00008718 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008719 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008720 asciilib_find_slice, ucs1lib_find_slice,
8721 ucs2lib_find_slice, ucs4lib_find_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008722 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008723 );
8724 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008725 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008726 asciilib_find_slice, ucs1lib_rfind_slice,
8727 ucs2lib_rfind_slice, ucs4lib_rfind_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008728 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008729 );
8730
Guido van Rossumd57fd912000-03-10 22:53:23 +00008731 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008732 Py_DECREF(sub);
8733
Guido van Rossumd57fd912000-03-10 22:53:23 +00008734 return result;
8735}
8736
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008737Py_ssize_t
8738PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8739 Py_ssize_t start, Py_ssize_t end,
8740 int direction)
8741{
8742 char *result;
8743 int kind;
8744 if (PyUnicode_READY(str) == -1)
8745 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008746 if (start < 0 || end < 0) {
8747 PyErr_SetString(PyExc_IndexError, "string index out of range");
8748 return -2;
8749 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008750 if (end > PyUnicode_GET_LENGTH(str))
8751 end = PyUnicode_GET_LENGTH(str);
8752 kind = PyUnicode_KIND(str);
8753 result = findchar(PyUnicode_1BYTE_DATA(str)
8754 + PyUnicode_KIND_SIZE(kind, start),
8755 kind,
8756 end-start, ch, direction);
8757 if (!result)
8758 return -1;
8759 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8760}
8761
Alexander Belopolsky40018472011-02-26 01:02:56 +00008762static int
8763tailmatch(PyUnicodeObject *self,
8764 PyUnicodeObject *substring,
8765 Py_ssize_t start,
8766 Py_ssize_t end,
8767 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008768{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008769 int kind_self;
8770 int kind_sub;
8771 void *data_self;
8772 void *data_sub;
8773 Py_ssize_t offset;
8774 Py_ssize_t i;
8775 Py_ssize_t end_sub;
8776
8777 if (PyUnicode_READY(self) == -1 ||
8778 PyUnicode_READY(substring) == -1)
8779 return 0;
8780
8781 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008782 return 1;
8783
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008784 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8785 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008786 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008787 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008788
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008789 kind_self = PyUnicode_KIND(self);
8790 data_self = PyUnicode_DATA(self);
8791 kind_sub = PyUnicode_KIND(substring);
8792 data_sub = PyUnicode_DATA(substring);
8793 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8794
8795 if (direction > 0)
8796 offset = end;
8797 else
8798 offset = start;
8799
8800 if (PyUnicode_READ(kind_self, data_self, offset) ==
8801 PyUnicode_READ(kind_sub, data_sub, 0) &&
8802 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8803 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8804 /* If both are of the same kind, memcmp is sufficient */
8805 if (kind_self == kind_sub) {
8806 return ! memcmp((char *)data_self +
8807 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8808 data_sub,
8809 PyUnicode_GET_LENGTH(substring) *
8810 PyUnicode_CHARACTER_SIZE(substring));
8811 }
8812 /* otherwise we have to compare each character by first accesing it */
8813 else {
8814 /* We do not need to compare 0 and len(substring)-1 because
8815 the if statement above ensured already that they are equal
8816 when we end up here. */
8817 // TODO: honor direction and do a forward or backwards search
8818 for (i = 1; i < end_sub; ++i) {
8819 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8820 PyUnicode_READ(kind_sub, data_sub, i))
8821 return 0;
8822 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008823 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008824 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008825 }
8826
8827 return 0;
8828}
8829
Alexander Belopolsky40018472011-02-26 01:02:56 +00008830Py_ssize_t
8831PyUnicode_Tailmatch(PyObject *str,
8832 PyObject *substr,
8833 Py_ssize_t start,
8834 Py_ssize_t end,
8835 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008836{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008837 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008838
Guido van Rossumd57fd912000-03-10 22:53:23 +00008839 str = PyUnicode_FromObject(str);
8840 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008841 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008842 substr = PyUnicode_FromObject(substr);
8843 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008844 Py_DECREF(str);
8845 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008846 }
Tim Petersced69f82003-09-16 20:30:58 +00008847
Guido van Rossumd57fd912000-03-10 22:53:23 +00008848 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008849 (PyUnicodeObject *)substr,
8850 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008851 Py_DECREF(str);
8852 Py_DECREF(substr);
8853 return result;
8854}
8855
Guido van Rossumd57fd912000-03-10 22:53:23 +00008856/* Apply fixfct filter to the Unicode object self and return a
8857 reference to the modified object */
8858
Alexander Belopolsky40018472011-02-26 01:02:56 +00008859static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02008860fixup(PyObject *self,
8861 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008862{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008863 PyObject *u;
8864 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008865
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008866 if (PyUnicode_READY(self) == -1)
8867 return NULL;
8868 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8869 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8870 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008871 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008872 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008873
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008874 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8875 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008876
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008877 /* fix functions return the new maximum character in a string,
8878 if the kind of the resulting unicode object does not change,
8879 everything is fine. Otherwise we need to change the string kind
8880 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02008881 maxchar_new = fixfct(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008882 if (maxchar_new == 0)
8883 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8884 else if (maxchar_new <= 127)
8885 maxchar_new = 127;
8886 else if (maxchar_new <= 255)
8887 maxchar_new = 255;
8888 else if (maxchar_new <= 65535)
8889 maxchar_new = 65535;
8890 else
8891 maxchar_new = 1114111; /* 0x10ffff */
8892
8893 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008894 /* fixfct should return TRUE if it modified the buffer. If
8895 FALSE, return a reference to the original buffer instead
8896 (to save space, not time) */
8897 Py_INCREF(self);
8898 Py_DECREF(u);
8899 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008900 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008901 else if (maxchar_new == maxchar_old) {
8902 return u;
8903 }
8904 else {
8905 /* In case the maximum character changed, we need to
8906 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008907 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008908 if (v == NULL) {
8909 Py_DECREF(u);
8910 return NULL;
8911 }
8912 if (maxchar_new > maxchar_old) {
8913 /* If the maxchar increased so that the kind changed, not all
8914 characters are representable anymore and we need to fix the
8915 string again. This only happens in very few cases. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02008916 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner9310abb2011-10-05 00:59:23 +02008917 maxchar_old = fixfct(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008918 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8919 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008920 else {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02008921 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008922 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008923
8924 Py_DECREF(u);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02008925 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008926 return v;
8927 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008928}
8929
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008930static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008931fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008932{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008933 /* No need to call PyUnicode_READY(self) because this function is only
8934 called as a callback from fixup() which does it already. */
8935 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8936 const int kind = PyUnicode_KIND(self);
8937 void *data = PyUnicode_DATA(self);
8938 int touched = 0;
8939 Py_UCS4 maxchar = 0;
8940 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008941
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008942 for (i = 0; i < len; ++i) {
8943 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8944 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8945 if (up != ch) {
8946 if (up > maxchar)
8947 maxchar = up;
8948 PyUnicode_WRITE(kind, data, i, up);
8949 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008950 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008951 else if (ch > maxchar)
8952 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008953 }
8954
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008955 if (touched)
8956 return maxchar;
8957 else
8958 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008959}
8960
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008961static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008962fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008963{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008964 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8965 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8966 const int kind = PyUnicode_KIND(self);
8967 void *data = PyUnicode_DATA(self);
8968 int touched = 0;
8969 Py_UCS4 maxchar = 0;
8970 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008971
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008972 for(i = 0; i < len; ++i) {
8973 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8974 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8975 if (lo != ch) {
8976 if (lo > maxchar)
8977 maxchar = lo;
8978 PyUnicode_WRITE(kind, data, i, lo);
8979 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008980 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008981 else if (ch > maxchar)
8982 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008983 }
8984
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008985 if (touched)
8986 return maxchar;
8987 else
8988 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008989}
8990
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008991static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008992fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008993{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008994 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8995 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8996 const int kind = PyUnicode_KIND(self);
8997 void *data = PyUnicode_DATA(self);
8998 int touched = 0;
8999 Py_UCS4 maxchar = 0;
9000 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009001
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009002 for(i = 0; i < len; ++i) {
9003 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9004 Py_UCS4 nu = 0;
9005
9006 if (Py_UNICODE_ISUPPER(ch))
9007 nu = Py_UNICODE_TOLOWER(ch);
9008 else if (Py_UNICODE_ISLOWER(ch))
9009 nu = Py_UNICODE_TOUPPER(ch);
9010
9011 if (nu != 0) {
9012 if (nu > maxchar)
9013 maxchar = nu;
9014 PyUnicode_WRITE(kind, data, i, nu);
9015 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009016 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009017 else if (ch > maxchar)
9018 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009019 }
9020
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009021 if (touched)
9022 return maxchar;
9023 else
9024 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009025}
9026
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009027static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009028fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009029{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009030 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9031 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9032 const int kind = PyUnicode_KIND(self);
9033 void *data = PyUnicode_DATA(self);
9034 int touched = 0;
9035 Py_UCS4 maxchar = 0;
9036 Py_ssize_t i = 0;
9037 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00009038
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009039 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009040 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009041
9042 ch = PyUnicode_READ(kind, data, i);
9043 if (!Py_UNICODE_ISUPPER(ch)) {
9044 maxchar = Py_UNICODE_TOUPPER(ch);
9045 PyUnicode_WRITE(kind, data, i, maxchar);
9046 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009047 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009048 ++i;
9049 for(; i < len; ++i) {
9050 ch = PyUnicode_READ(kind, data, i);
9051 if (!Py_UNICODE_ISLOWER(ch)) {
9052 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9053 if (lo > maxchar)
9054 maxchar = lo;
9055 PyUnicode_WRITE(kind, data, i, lo);
9056 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009057 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009058 else if (ch > maxchar)
9059 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009060 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009061
9062 if (touched)
9063 return maxchar;
9064 else
9065 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009066}
9067
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009068static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009069fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009070{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009071 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9072 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9073 const int kind = PyUnicode_KIND(self);
9074 void *data = PyUnicode_DATA(self);
9075 Py_UCS4 maxchar = 0;
9076 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009077 int previous_is_cased;
9078
9079 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009080 if (len == 1) {
9081 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9082 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9083 if (ti != ch) {
9084 PyUnicode_WRITE(kind, data, i, ti);
9085 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00009086 }
9087 else
9088 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009089 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009090 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009091 for(; i < len; ++i) {
9092 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9093 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00009094
Benjamin Peterson29060642009-01-31 22:14:21 +00009095 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009096 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00009097 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009098 nu = Py_UNICODE_TOTITLE(ch);
9099
9100 if (nu > maxchar)
9101 maxchar = nu;
9102 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00009103
Benjamin Peterson29060642009-01-31 22:14:21 +00009104 if (Py_UNICODE_ISLOWER(ch) ||
9105 Py_UNICODE_ISUPPER(ch) ||
9106 Py_UNICODE_ISTITLE(ch))
9107 previous_is_cased = 1;
9108 else
9109 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009110 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009111 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009112}
9113
Tim Peters8ce9f162004-08-27 01:49:32 +00009114PyObject *
9115PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009116{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009117 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009118 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009119 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009120 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009121 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9122 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009123 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009124 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009125 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009126 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009127
Tim Peters05eba1f2004-08-27 21:32:02 +00009128 fseq = PySequence_Fast(seq, "");
9129 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009130 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009131 }
9132
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009133 /* NOTE: the following code can't call back into Python code,
9134 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009135 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009136
Tim Peters05eba1f2004-08-27 21:32:02 +00009137 seqlen = PySequence_Fast_GET_SIZE(fseq);
9138 /* If empty sequence, return u"". */
9139 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009140 Py_DECREF(fseq);
9141 Py_INCREF(unicode_empty);
9142 res = unicode_empty;
9143 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009144 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009145
Tim Peters05eba1f2004-08-27 21:32:02 +00009146 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009147 items = PySequence_Fast_ITEMS(fseq);
9148 if (seqlen == 1 && PyUnicode_CheckExact(items[0])) {
9149 res = items[0];
9150 Py_INCREF(res);
9151 Py_DECREF(fseq);
9152 return res;
9153 }
9154
9155 /* Set up sep and seplen */
9156 if (separator == NULL) {
9157 /* fall back to a blank space separator */
9158 sep = PyUnicode_FromOrdinal(' ');
9159 if (!sep)
9160 goto onError;
9161 maxchar = 32;
Tim Peters8ce9f162004-08-27 01:49:32 +00009162 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009163 else {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009164 if (!PyUnicode_Check(separator)) {
9165 PyErr_Format(PyExc_TypeError,
9166 "separator: expected str instance,"
9167 " %.80s found",
9168 Py_TYPE(separator)->tp_name);
9169 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00009170 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009171 if (PyUnicode_READY(separator))
9172 goto onError;
9173 sep = separator;
9174 seplen = PyUnicode_GET_LENGTH(separator);
9175 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9176 /* inc refcount to keep this code path symmetric with the
9177 above case of a blank separator */
9178 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00009179 }
9180
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009181 /* There are at least two things to join, or else we have a subclass
9182 * of str in the sequence.
9183 * Do a pre-pass to figure out the total amount of space we'll
9184 * need (sz), and see whether all argument are strings.
9185 */
9186 sz = 0;
9187 for (i = 0; i < seqlen; i++) {
9188 const Py_ssize_t old_sz = sz;
9189 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009190 if (!PyUnicode_Check(item)) {
9191 PyErr_Format(PyExc_TypeError,
9192 "sequence item %zd: expected str instance,"
9193 " %.80s found",
9194 i, Py_TYPE(item)->tp_name);
9195 goto onError;
9196 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009197 if (PyUnicode_READY(item) == -1)
9198 goto onError;
9199 sz += PyUnicode_GET_LENGTH(item);
9200 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
9201 if (item_maxchar > maxchar)
9202 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009203 if (i != 0)
9204 sz += seplen;
9205 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9206 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009207 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009208 goto onError;
9209 }
9210 }
Tim Petersced69f82003-09-16 20:30:58 +00009211
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009212 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009213 if (res == NULL)
9214 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009215
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009216 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009217 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009218 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009219 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009220 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009221 if (i && seplen != 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009222 copy_characters(res, res_offset, sep, 0, seplen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009223 res_offset += seplen;
Benjamin Peterson29060642009-01-31 22:14:21 +00009224 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009225 itemlen = PyUnicode_GET_LENGTH(item);
9226 if (itemlen != 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009227 copy_characters(res, res_offset, item, 0, itemlen);
Victor Stinner9ce5a832011-10-03 23:36:02 +02009228 res_offset += itemlen;
Victor Stinner9ce5a832011-10-03 23:36:02 +02009229 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009230 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009231 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009232
Tim Peters05eba1f2004-08-27 21:32:02 +00009233 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009234 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009235 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009236 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009237
Benjamin Peterson29060642009-01-31 22:14:21 +00009238 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009239 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009240 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009241 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009242 return NULL;
9243}
9244
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009245#define FILL(kind, data, value, start, length) \
9246 do { \
9247 Py_ssize_t i_ = 0; \
9248 assert(kind != PyUnicode_WCHAR_KIND); \
9249 switch ((kind)) { \
9250 case PyUnicode_1BYTE_KIND: { \
9251 unsigned char * to_ = (unsigned char *)((data)) + (start); \
9252 memset(to_, (unsigned char)value, length); \
9253 break; \
9254 } \
9255 case PyUnicode_2BYTE_KIND: { \
9256 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
9257 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9258 break; \
9259 } \
9260 default: { \
9261 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
9262 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9263 break; \
9264 } \
9265 } \
9266 } while (0)
9267
Victor Stinner9310abb2011-10-05 00:59:23 +02009268static PyObject *
9269pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009270 Py_ssize_t left,
9271 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009272 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009273{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009274 PyObject *u;
9275 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009276 int kind;
9277 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009278
9279 if (left < 0)
9280 left = 0;
9281 if (right < 0)
9282 right = 0;
9283
Tim Peters7a29bd52001-09-12 03:03:31 +00009284 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009285 Py_INCREF(self);
9286 return self;
9287 }
9288
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009289 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9290 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009291 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9292 return NULL;
9293 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009294 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9295 if (fill > maxchar)
9296 maxchar = fill;
9297 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009298 if (!u)
9299 return NULL;
9300
9301 kind = PyUnicode_KIND(u);
9302 data = PyUnicode_DATA(u);
9303 if (left)
9304 FILL(kind, data, fill, 0, left);
9305 if (right)
9306 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009307 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009308 assert(_PyUnicode_CheckConsistency(u, 1));
9309 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009310}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009311#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009312
Alexander Belopolsky40018472011-02-26 01:02:56 +00009313PyObject *
9314PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009315{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009316 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009317
9318 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009319 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009320 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009321
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009322 switch(PyUnicode_KIND(string)) {
9323 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009324 if (PyUnicode_IS_ASCII(string))
9325 list = asciilib_splitlines(
9326 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9327 PyUnicode_GET_LENGTH(string), keepends);
9328 else
9329 list = ucs1lib_splitlines(
9330 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9331 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009332 break;
9333 case PyUnicode_2BYTE_KIND:
9334 list = ucs2lib_splitlines(
9335 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
9336 PyUnicode_GET_LENGTH(string), keepends);
9337 break;
9338 case PyUnicode_4BYTE_KIND:
9339 list = ucs4lib_splitlines(
9340 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
9341 PyUnicode_GET_LENGTH(string), keepends);
9342 break;
9343 default:
9344 assert(0);
9345 list = 0;
9346 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009347 Py_DECREF(string);
9348 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009349}
9350
Alexander Belopolsky40018472011-02-26 01:02:56 +00009351static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009352split(PyObject *self,
9353 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009354 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009355{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009356 int kind1, kind2, kind;
9357 void *buf1, *buf2;
9358 Py_ssize_t len1, len2;
9359 PyObject* out;
9360
Guido van Rossumd57fd912000-03-10 22:53:23 +00009361 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009362 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009363
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009364 if (PyUnicode_READY(self) == -1)
9365 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009366
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009367 if (substring == NULL)
9368 switch(PyUnicode_KIND(self)) {
9369 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009370 if (PyUnicode_IS_ASCII(self))
9371 return asciilib_split_whitespace(
9372 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9373 PyUnicode_GET_LENGTH(self), maxcount
9374 );
9375 else
9376 return ucs1lib_split_whitespace(
9377 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9378 PyUnicode_GET_LENGTH(self), maxcount
9379 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009380 case PyUnicode_2BYTE_KIND:
9381 return ucs2lib_split_whitespace(
9382 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9383 PyUnicode_GET_LENGTH(self), maxcount
9384 );
9385 case PyUnicode_4BYTE_KIND:
9386 return ucs4lib_split_whitespace(
9387 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9388 PyUnicode_GET_LENGTH(self), maxcount
9389 );
9390 default:
9391 assert(0);
9392 return NULL;
9393 }
9394
9395 if (PyUnicode_READY(substring) == -1)
9396 return NULL;
9397
9398 kind1 = PyUnicode_KIND(self);
9399 kind2 = PyUnicode_KIND(substring);
9400 kind = kind1 > kind2 ? kind1 : kind2;
9401 buf1 = PyUnicode_DATA(self);
9402 buf2 = PyUnicode_DATA(substring);
9403 if (kind1 != kind)
9404 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9405 if (!buf1)
9406 return NULL;
9407 if (kind2 != kind)
9408 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9409 if (!buf2) {
9410 if (kind1 != kind) PyMem_Free(buf1);
9411 return NULL;
9412 }
9413 len1 = PyUnicode_GET_LENGTH(self);
9414 len2 = PyUnicode_GET_LENGTH(substring);
9415
9416 switch(kind) {
9417 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009418 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9419 out = asciilib_split(
9420 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9421 else
9422 out = ucs1lib_split(
9423 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009424 break;
9425 case PyUnicode_2BYTE_KIND:
9426 out = ucs2lib_split(
9427 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9428 break;
9429 case PyUnicode_4BYTE_KIND:
9430 out = ucs4lib_split(
9431 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9432 break;
9433 default:
9434 out = NULL;
9435 }
9436 if (kind1 != kind)
9437 PyMem_Free(buf1);
9438 if (kind2 != kind)
9439 PyMem_Free(buf2);
9440 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009441}
9442
Alexander Belopolsky40018472011-02-26 01:02:56 +00009443static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009444rsplit(PyObject *self,
9445 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009446 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009447{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009448 int kind1, kind2, kind;
9449 void *buf1, *buf2;
9450 Py_ssize_t len1, len2;
9451 PyObject* out;
9452
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009453 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009454 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009455
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009456 if (PyUnicode_READY(self) == -1)
9457 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009458
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009459 if (substring == NULL)
9460 switch(PyUnicode_KIND(self)) {
9461 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009462 if (PyUnicode_IS_ASCII(self))
9463 return asciilib_rsplit_whitespace(
9464 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9465 PyUnicode_GET_LENGTH(self), maxcount
9466 );
9467 else
9468 return ucs1lib_rsplit_whitespace(
9469 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9470 PyUnicode_GET_LENGTH(self), maxcount
9471 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009472 case PyUnicode_2BYTE_KIND:
9473 return ucs2lib_rsplit_whitespace(
9474 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9475 PyUnicode_GET_LENGTH(self), maxcount
9476 );
9477 case PyUnicode_4BYTE_KIND:
9478 return ucs4lib_rsplit_whitespace(
9479 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9480 PyUnicode_GET_LENGTH(self), maxcount
9481 );
9482 default:
9483 assert(0);
9484 return NULL;
9485 }
9486
9487 if (PyUnicode_READY(substring) == -1)
9488 return NULL;
9489
9490 kind1 = PyUnicode_KIND(self);
9491 kind2 = PyUnicode_KIND(substring);
9492 kind = kind1 > kind2 ? kind1 : kind2;
9493 buf1 = PyUnicode_DATA(self);
9494 buf2 = PyUnicode_DATA(substring);
9495 if (kind1 != kind)
9496 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9497 if (!buf1)
9498 return NULL;
9499 if (kind2 != kind)
9500 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9501 if (!buf2) {
9502 if (kind1 != kind) PyMem_Free(buf1);
9503 return NULL;
9504 }
9505 len1 = PyUnicode_GET_LENGTH(self);
9506 len2 = PyUnicode_GET_LENGTH(substring);
9507
9508 switch(kind) {
9509 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009510 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9511 out = asciilib_rsplit(
9512 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9513 else
9514 out = ucs1lib_rsplit(
9515 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009516 break;
9517 case PyUnicode_2BYTE_KIND:
9518 out = ucs2lib_rsplit(
9519 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9520 break;
9521 case PyUnicode_4BYTE_KIND:
9522 out = ucs4lib_rsplit(
9523 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9524 break;
9525 default:
9526 out = NULL;
9527 }
9528 if (kind1 != kind)
9529 PyMem_Free(buf1);
9530 if (kind2 != kind)
9531 PyMem_Free(buf2);
9532 return out;
9533}
9534
9535static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009536anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9537 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009538{
9539 switch(kind) {
9540 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009541 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9542 return asciilib_find(buf1, len1, buf2, len2, offset);
9543 else
9544 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009545 case PyUnicode_2BYTE_KIND:
9546 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9547 case PyUnicode_4BYTE_KIND:
9548 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9549 }
9550 assert(0);
9551 return -1;
9552}
9553
9554static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009555anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9556 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009557{
9558 switch(kind) {
9559 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009560 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9561 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9562 else
9563 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009564 case PyUnicode_2BYTE_KIND:
9565 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9566 case PyUnicode_4BYTE_KIND:
9567 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9568 }
9569 assert(0);
9570 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009571}
9572
Alexander Belopolsky40018472011-02-26 01:02:56 +00009573static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009574replace(PyObject *self, PyObject *str1,
9575 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009576{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009577 PyObject *u;
9578 char *sbuf = PyUnicode_DATA(self);
9579 char *buf1 = PyUnicode_DATA(str1);
9580 char *buf2 = PyUnicode_DATA(str2);
9581 int srelease = 0, release1 = 0, release2 = 0;
9582 int skind = PyUnicode_KIND(self);
9583 int kind1 = PyUnicode_KIND(str1);
9584 int kind2 = PyUnicode_KIND(str2);
9585 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9586 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9587 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009588
9589 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009590 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009591 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009592 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009593
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009594 if (skind < kind1)
9595 /* substring too wide to be present */
9596 goto nothing;
9597
9598 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00009599 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009600 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009601 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009602 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009603 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009604 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009605 Py_UCS4 u1, u2, maxchar;
9606 int mayshrink, rkind;
9607 u1 = PyUnicode_READ_CHAR(str1, 0);
9608 if (!findchar(sbuf, PyUnicode_KIND(self),
9609 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00009610 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009611 u2 = PyUnicode_READ_CHAR(str2, 0);
9612 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9613 /* Replacing u1 with u2 may cause a maxchar reduction in the
9614 result string. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009615 if (u2 > maxchar) {
9616 maxchar = u2;
9617 mayshrink = 0;
9618 }
Victor Stinnerb9275c12011-10-05 14:01:42 +02009619 else
9620 mayshrink = maxchar > 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009621 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009622 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009623 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009624 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009625 rkind = PyUnicode_KIND(u);
9626 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9627 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009628 if (--maxcount < 0)
9629 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009630 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009631 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009632 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +02009633 unicode_adjust_maxchar(&u);
9634 if (u == NULL)
9635 goto error;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009636 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009637 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009638 int rkind = skind;
9639 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +02009640 PyObject *rstr;
9641 Py_UCS4 maxchar;
9642
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009643 if (kind1 < rkind) {
9644 /* widen substring */
9645 buf1 = _PyUnicode_AsKind(str1, rkind);
9646 if (!buf1) goto error;
9647 release1 = 1;
9648 }
Victor Stinnerc3cec782011-10-05 21:24:08 +02009649 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009650 if (i < 0)
9651 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009652 if (rkind > kind2) {
9653 /* widen replacement */
9654 buf2 = _PyUnicode_AsKind(str2, rkind);
9655 if (!buf2) goto error;
9656 release2 = 1;
9657 }
9658 else if (rkind < kind2) {
9659 /* widen self and buf1 */
9660 rkind = kind2;
9661 if (release1) PyMem_Free(buf1);
9662 sbuf = _PyUnicode_AsKind(self, rkind);
9663 if (!sbuf) goto error;
9664 srelease = 1;
9665 buf1 = _PyUnicode_AsKind(str1, rkind);
9666 if (!buf1) goto error;
9667 release1 = 1;
9668 }
Victor Stinner25a4b292011-10-06 12:31:55 +02009669 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9670 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(str2));
9671 rstr = PyUnicode_New(slen, maxchar);
9672 if (!rstr)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009673 goto error;
Victor Stinner25a4b292011-10-06 12:31:55 +02009674 res = PyUnicode_DATA(rstr);
9675
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009676 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009677 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009678 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9679 buf2,
9680 PyUnicode_KIND_SIZE(rkind, len2));
9681 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009682
9683 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +02009684 i = anylib_find(rkind, self,
9685 sbuf+PyUnicode_KIND_SIZE(rkind, i), slen-i,
9686 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009687 if (i == -1)
9688 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009689 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9690 buf2,
9691 PyUnicode_KIND_SIZE(rkind, len2));
9692 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009693 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009694
Victor Stinner25a4b292011-10-06 12:31:55 +02009695 u = rstr;
9696 unicode_adjust_maxchar(&u);
9697 if (!u)
9698 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009699 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009700 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009701
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009702 Py_ssize_t n, i, j, ires;
9703 Py_ssize_t product, new_size;
9704 int rkind = skind;
Victor Stinner25a4b292011-10-06 12:31:55 +02009705 PyObject *rstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009706 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +02009707 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009708
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009709 if (kind1 < rkind) {
9710 buf1 = _PyUnicode_AsKind(str1, rkind);
9711 if (!buf1) goto error;
9712 release1 = 1;
9713 }
Victor Stinnerc3cec782011-10-05 21:24:08 +02009714 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009715 if (n == 0)
9716 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009717 if (kind2 < rkind) {
9718 buf2 = _PyUnicode_AsKind(str2, rkind);
9719 if (!buf2) goto error;
9720 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009721 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009722 else if (kind2 > rkind) {
9723 rkind = kind2;
9724 sbuf = _PyUnicode_AsKind(self, rkind);
9725 if (!sbuf) goto error;
9726 srelease = 1;
9727 if (release1) PyMem_Free(buf1);
9728 buf1 = _PyUnicode_AsKind(str1, rkind);
9729 if (!buf1) goto error;
9730 release1 = 1;
9731 }
9732 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9733 PyUnicode_GET_LENGTH(str1))); */
9734 product = n * (len2-len1);
9735 if ((product / (len2-len1)) != n) {
9736 PyErr_SetString(PyExc_OverflowError,
9737 "replace string is too long");
9738 goto error;
9739 }
9740 new_size = slen + product;
9741 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9742 PyErr_SetString(PyExc_OverflowError,
9743 "replace string is too long");
9744 goto error;
9745 }
Victor Stinner25a4b292011-10-06 12:31:55 +02009746 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9747 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(str2));
9748 rstr = PyUnicode_New(new_size, maxchar);
9749 if (!rstr)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009750 goto error;
Victor Stinner25a4b292011-10-06 12:31:55 +02009751 res = PyUnicode_DATA(rstr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009752 ires = i = 0;
9753 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009754 while (n-- > 0) {
9755 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +02009756 j = anylib_find(rkind, self,
9757 sbuf + PyUnicode_KIND_SIZE(rkind, i), slen-i,
9758 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009759 if (j == -1)
9760 break;
9761 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009762 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009763 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9764 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9765 PyUnicode_KIND_SIZE(rkind, j-i));
9766 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009767 }
9768 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009769 if (len2 > 0) {
9770 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9771 buf2,
9772 PyUnicode_KIND_SIZE(rkind, len2));
9773 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009774 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009775 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009776 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009777 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009778 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009779 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9780 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9781 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009782 } else {
9783 /* interleave */
9784 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009785 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9786 buf2,
9787 PyUnicode_KIND_SIZE(rkind, len2));
9788 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009789 if (--n <= 0)
9790 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009791 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9792 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9793 PyUnicode_KIND_SIZE(rkind, 1));
9794 ires++;
9795 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009796 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009797 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9798 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9799 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009800 }
Victor Stinner25a4b292011-10-06 12:31:55 +02009801 u = rstr;
9802 unicode_adjust_maxchar(&u);
9803 if (u == NULL)
9804 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009805 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009806 if (srelease)
9807 PyMem_FREE(sbuf);
9808 if (release1)
9809 PyMem_FREE(buf1);
9810 if (release2)
9811 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009812 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009813 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009814
Benjamin Peterson29060642009-01-31 22:14:21 +00009815 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009816 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009817 if (srelease)
9818 PyMem_FREE(sbuf);
9819 if (release1)
9820 PyMem_FREE(buf1);
9821 if (release2)
9822 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009823 if (PyUnicode_CheckExact(self)) {
9824 Py_INCREF(self);
9825 return (PyObject *) self;
9826 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009827 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009828 error:
9829 if (srelease && sbuf)
9830 PyMem_FREE(sbuf);
9831 if (release1 && buf1)
9832 PyMem_FREE(buf1);
9833 if (release2 && buf2)
9834 PyMem_FREE(buf2);
9835 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009836}
9837
9838/* --- Unicode Object Methods --------------------------------------------- */
9839
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009840PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009841 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009842\n\
9843Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009844characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009845
9846static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +02009847unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009848{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009849 return fixup(self, fixtitle);
9850}
9851
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009852PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009853 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009854\n\
9855Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009856have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009857
9858static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +02009859unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009860{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009861 return fixup(self, fixcapitalize);
9862}
9863
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled code: split self into words, capitalize each word in place in
   the list, then join the list back into one string.  Returns NULL if the
   split, any capitalization, or the join fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list slot */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
9901
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009902/* Argument converter. Coerces to a single unicode character */
9903
9904static int
9905convert_uc(PyObject *obj, void *addr)
9906{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009907 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009908 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009909
Benjamin Peterson14339b62009-01-31 16:36:08 +00009910 uniobj = PyUnicode_FromObject(obj);
9911 if (uniobj == NULL) {
9912 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009913 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009914 return 0;
9915 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009916 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009917 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009918 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009919 Py_DECREF(uniobj);
9920 return 0;
9921 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009922 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009923 Py_DECREF(uniobj);
9924 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009925}
9926
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009927PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009928 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009929\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009930Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009931done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009932
9933static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009934unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009935{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009936 Py_ssize_t marg, left;
9937 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009938 Py_UCS4 fillchar = ' ';
9939
Victor Stinnere9a29352011-10-01 02:14:59 +02009940 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009941 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009942
Victor Stinnere9a29352011-10-01 02:14:59 +02009943 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009944 return NULL;
9945
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009946 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009947 Py_INCREF(self);
9948 return (PyObject*) self;
9949 }
9950
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009951 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009952 left = marg / 2 + (marg & width & 1);
9953
Victor Stinner9310abb2011-10-05 00:59:23 +02009954 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009955}
9956
Marc-André Lemburge5034372000-08-08 08:04:29 +00009957#if 0
9958
9959/* This code should go into some future Unicode collation support
9960 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009961 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009962
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009963/* speedy UTF-16 code point order comparison */
9964/* gleaned from: */
9965/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9966
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009967static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009968{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009969 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009970 0, 0, 0, 0, 0, 0, 0, 0,
9971 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009972 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009973};
9974
Guido van Rossumd57fd912000-03-10 22:53:23 +00009975static int
9976unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9977{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009978 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009979
Guido van Rossumd57fd912000-03-10 22:53:23 +00009980 Py_UNICODE *s1 = str1->str;
9981 Py_UNICODE *s2 = str2->str;
9982
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009983 len1 = str1->_base._base.length;
9984 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009985
Guido van Rossumd57fd912000-03-10 22:53:23 +00009986 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009987 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009988
9989 c1 = *s1++;
9990 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009991
Benjamin Peterson29060642009-01-31 22:14:21 +00009992 if (c1 > (1<<11) * 26)
9993 c1 += utf16Fixup[c1>>11];
9994 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009995 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009996 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009997
9998 if (c1 != c2)
9999 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +000010000
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +000010001 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010002 }
10003
10004 return (len1 < len2) ? -1 : (len1 != len2);
10005}
10006
Marc-André Lemburge5034372000-08-08 08:04:29 +000010007#else
10008
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010009/* This function assumes that str1 and str2 are readied by the caller. */
10010
Marc-André Lemburge5034372000-08-08 08:04:29 +000010011static int
10012unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
10013{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010014 int kind1, kind2;
10015 void *data1, *data2;
10016 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010017
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010018 kind1 = PyUnicode_KIND(str1);
10019 kind2 = PyUnicode_KIND(str2);
10020 data1 = PyUnicode_DATA(str1);
10021 data2 = PyUnicode_DATA(str2);
10022 len1 = PyUnicode_GET_LENGTH(str1);
10023 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010024
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010025 for (i = 0; i < len1 && i < len2; ++i) {
10026 Py_UCS4 c1, c2;
10027 c1 = PyUnicode_READ(kind1, data1, i);
10028 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010029
10030 if (c1 != c2)
10031 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010032 }
10033
10034 return (len1 < len2) ? -1 : (len1 != len2);
10035}
10036
10037#endif
10038
Alexander Belopolsky40018472011-02-26 01:02:56 +000010039int
10040PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010041{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010042 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10043 if (PyUnicode_READY(left) == -1 ||
10044 PyUnicode_READY(right) == -1)
10045 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010046 return unicode_compare((PyUnicodeObject *)left,
10047 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010048 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010049 PyErr_Format(PyExc_TypeError,
10050 "Can't compare %.100s and %.100s",
10051 left->ob_type->tp_name,
10052 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010053 return -1;
10054}
10055
Martin v. Löwis5b222132007-06-10 09:51:05 +000010056int
10057PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10058{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010059 Py_ssize_t i;
10060 int kind;
10061 void *data;
10062 Py_UCS4 chr;
10063
Victor Stinner910337b2011-10-03 03:20:16 +020010064 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010065 if (PyUnicode_READY(uni) == -1)
10066 return -1;
10067 kind = PyUnicode_KIND(uni);
10068 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010069 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010070 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10071 if (chr != str[i])
10072 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010073 /* This check keeps Python strings that end in '\0' from comparing equal
10074 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010075 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010076 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010077 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010078 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010079 return 0;
10080}
10081
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010082
Benjamin Peterson29060642009-01-31 22:14:21 +000010083#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010084 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010085
Alexander Belopolsky40018472011-02-26 01:02:56 +000010086PyObject *
10087PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010088{
10089 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010090
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010091 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10092 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010093 if (PyUnicode_READY(left) == -1 ||
10094 PyUnicode_READY(right) == -1)
10095 return NULL;
10096 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10097 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010098 if (op == Py_EQ) {
10099 Py_INCREF(Py_False);
10100 return Py_False;
10101 }
10102 if (op == Py_NE) {
10103 Py_INCREF(Py_True);
10104 return Py_True;
10105 }
10106 }
10107 if (left == right)
10108 result = 0;
10109 else
10110 result = unicode_compare((PyUnicodeObject *)left,
10111 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010112
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010113 /* Convert the return value to a Boolean */
10114 switch (op) {
10115 case Py_EQ:
10116 v = TEST_COND(result == 0);
10117 break;
10118 case Py_NE:
10119 v = TEST_COND(result != 0);
10120 break;
10121 case Py_LE:
10122 v = TEST_COND(result <= 0);
10123 break;
10124 case Py_GE:
10125 v = TEST_COND(result >= 0);
10126 break;
10127 case Py_LT:
10128 v = TEST_COND(result == -1);
10129 break;
10130 case Py_GT:
10131 v = TEST_COND(result == 1);
10132 break;
10133 default:
10134 PyErr_BadArgument();
10135 return NULL;
10136 }
10137 Py_INCREF(v);
10138 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010139 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010140
Brian Curtindfc80e32011-08-10 20:28:54 -050010141 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010142}
10143
Alexander Belopolsky40018472011-02-26 01:02:56 +000010144int
10145PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010146{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010147 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010148 int kind1, kind2, kind;
10149 void *buf1, *buf2;
10150 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010151 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010152
10153 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010154 sub = PyUnicode_FromObject(element);
10155 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010156 PyErr_Format(PyExc_TypeError,
10157 "'in <string>' requires string as left operand, not %s",
10158 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010159 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010160 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010161 if (PyUnicode_READY(sub) == -1)
10162 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010163
Thomas Wouters477c8d52006-05-27 19:21:47 +000010164 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010165 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010166 Py_DECREF(sub);
10167 return -1;
10168 }
10169
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010170 kind1 = PyUnicode_KIND(str);
10171 kind2 = PyUnicode_KIND(sub);
10172 kind = kind1 > kind2 ? kind1 : kind2;
10173 buf1 = PyUnicode_DATA(str);
10174 buf2 = PyUnicode_DATA(sub);
10175 if (kind1 != kind)
10176 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
10177 if (!buf1) {
10178 Py_DECREF(sub);
10179 return -1;
10180 }
10181 if (kind2 != kind)
10182 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
10183 if (!buf2) {
10184 Py_DECREF(sub);
10185 if (kind1 != kind) PyMem_Free(buf1);
10186 return -1;
10187 }
10188 len1 = PyUnicode_GET_LENGTH(str);
10189 len2 = PyUnicode_GET_LENGTH(sub);
10190
10191 switch(kind) {
10192 case PyUnicode_1BYTE_KIND:
10193 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10194 break;
10195 case PyUnicode_2BYTE_KIND:
10196 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10197 break;
10198 case PyUnicode_4BYTE_KIND:
10199 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10200 break;
10201 default:
10202 result = -1;
10203 assert(0);
10204 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010205
10206 Py_DECREF(str);
10207 Py_DECREF(sub);
10208
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010209 if (kind1 != kind)
10210 PyMem_Free(buf1);
10211 if (kind2 != kind)
10212 PyMem_Free(buf2);
10213
Guido van Rossum403d68b2000-03-13 15:55:09 +000010214 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010215}
10216
Guido van Rossumd57fd912000-03-10 22:53:23 +000010217/* Concat to string or Unicode object giving a new Unicode object. */
10218
Alexander Belopolsky40018472011-02-26 01:02:56 +000010219PyObject *
10220PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010221{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010222 PyObject *u = NULL, *v = NULL, *w;
10223 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010224
10225 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010226 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010227 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010228 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010229 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010230 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010231 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010232
10233 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010234 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010235 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010236 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010237 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010238 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010239 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010240 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010241 }
10242
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010243 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +020010244 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010245
Guido van Rossumd57fd912000-03-10 22:53:23 +000010246 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010247 w = PyUnicode_New(
10248 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10249 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010250 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010251 goto onError;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010252 copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
10253 copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010254 Py_DECREF(u);
10255 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010256 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010257 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010258
Benjamin Peterson29060642009-01-31 22:14:21 +000010259 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010260 Py_XDECREF(u);
10261 Py_XDECREF(v);
10262 return NULL;
10263}
10264
Victor Stinnerb0923652011-10-04 01:17:31 +020010265static void
10266unicode_append_inplace(PyObject **p_left, PyObject *right)
10267{
10268 Py_ssize_t left_len, right_len, new_len;
Victor Stinnerb0923652011-10-04 01:17:31 +020010269
10270 assert(PyUnicode_IS_READY(*p_left));
10271 assert(PyUnicode_IS_READY(right));
10272
10273 left_len = PyUnicode_GET_LENGTH(*p_left);
10274 right_len = PyUnicode_GET_LENGTH(right);
10275 if (left_len > PY_SSIZE_T_MAX - right_len) {
10276 PyErr_SetString(PyExc_OverflowError,
10277 "strings are too large to concat");
10278 goto error;
10279 }
10280 new_len = left_len + right_len;
10281
10282 /* Now we own the last reference to 'left', so we can resize it
10283 * in-place.
10284 */
10285 if (unicode_resize(p_left, new_len) != 0) {
10286 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10287 * deallocated so it cannot be put back into
10288 * 'variable'. The MemoryError is raised when there
10289 * is no value in 'variable', which might (very
10290 * remotely) be a cause of incompatibilities.
10291 */
10292 goto error;
10293 }
10294 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010295 copy_characters(*p_left, left_len, right, 0, right_len);
10296 _PyUnicode_DIRTY(*p_left);
Victor Stinnerb0923652011-10-04 01:17:31 +020010297 return;
10298
10299error:
10300 Py_DECREF(*p_left);
10301 *p_left = NULL;
10302}
10303
Walter Dörwald1ab83302007-05-18 17:15:44 +000010304void
Victor Stinner23e56682011-10-03 03:54:37 +020010305PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010306{
Victor Stinner23e56682011-10-03 03:54:37 +020010307 PyObject *left, *res;
10308
10309 if (p_left == NULL) {
10310 if (!PyErr_Occurred())
10311 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010312 return;
10313 }
Victor Stinner23e56682011-10-03 03:54:37 +020010314 left = *p_left;
10315 if (right == NULL || !PyUnicode_Check(left)) {
10316 if (!PyErr_Occurred())
10317 PyErr_BadInternalCall();
10318 goto error;
10319 }
10320
Victor Stinnere1335c72011-10-04 20:53:03 +020010321 if (PyUnicode_READY(left))
10322 goto error;
10323 if (PyUnicode_READY(right))
10324 goto error;
10325
Victor Stinner23e56682011-10-03 03:54:37 +020010326 if (PyUnicode_CheckExact(left) && left != unicode_empty
10327 && PyUnicode_CheckExact(right) && right != unicode_empty
10328 && unicode_resizable(left)
10329 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10330 || _PyUnicode_WSTR(left) != NULL))
10331 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010332 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10333 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010334 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010335 not so different than duplicating the string. */
10336 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010337 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010338 unicode_append_inplace(p_left, right);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010339 if (p_left != NULL)
10340 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010341 return;
10342 }
10343 }
10344
10345 res = PyUnicode_Concat(left, right);
10346 if (res == NULL)
10347 goto error;
10348 Py_DECREF(left);
10349 *p_left = res;
10350 return;
10351
10352error:
10353 Py_DECREF(*p_left);
10354 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010355}
10356
10357void
10358PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10359{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010360 PyUnicode_Append(pleft, right);
10361 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010362}
10363
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010364PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010365 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010366\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010367Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010368string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010369interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010370
10371static PyObject *
10372unicode_count(PyUnicodeObject *self, PyObject *args)
10373{
10374 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010375 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010376 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010377 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010378 int kind1, kind2, kind;
10379 void *buf1, *buf2;
10380 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010381
Jesus Ceaac451502011-04-20 17:09:23 +020010382 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10383 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010384 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010385
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010386 kind1 = PyUnicode_KIND(self);
10387 kind2 = PyUnicode_KIND(substring);
10388 kind = kind1 > kind2 ? kind1 : kind2;
10389 buf1 = PyUnicode_DATA(self);
10390 buf2 = PyUnicode_DATA(substring);
10391 if (kind1 != kind)
10392 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10393 if (!buf1) {
10394 Py_DECREF(substring);
10395 return NULL;
10396 }
10397 if (kind2 != kind)
10398 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10399 if (!buf2) {
10400 Py_DECREF(substring);
10401 if (kind1 != kind) PyMem_Free(buf1);
10402 return NULL;
10403 }
10404 len1 = PyUnicode_GET_LENGTH(self);
10405 len2 = PyUnicode_GET_LENGTH(substring);
10406
10407 ADJUST_INDICES(start, end, len1);
10408 switch(kind) {
10409 case PyUnicode_1BYTE_KIND:
10410 iresult = ucs1lib_count(
10411 ((Py_UCS1*)buf1) + start, end - start,
10412 buf2, len2, PY_SSIZE_T_MAX
10413 );
10414 break;
10415 case PyUnicode_2BYTE_KIND:
10416 iresult = ucs2lib_count(
10417 ((Py_UCS2*)buf1) + start, end - start,
10418 buf2, len2, PY_SSIZE_T_MAX
10419 );
10420 break;
10421 case PyUnicode_4BYTE_KIND:
10422 iresult = ucs4lib_count(
10423 ((Py_UCS4*)buf1) + start, end - start,
10424 buf2, len2, PY_SSIZE_T_MAX
10425 );
10426 break;
10427 default:
10428 assert(0); iresult = 0;
10429 }
10430
10431 result = PyLong_FromSsize_t(iresult);
10432
10433 if (kind1 != kind)
10434 PyMem_Free(buf1);
10435 if (kind2 != kind)
10436 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010437
10438 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010439
Guido van Rossumd57fd912000-03-10 22:53:23 +000010440 return result;
10441}
10442
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010443PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010444 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010445\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010446Encode S using the codec registered for encoding. Default encoding\n\
10447is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010448handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010449a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10450'xmlcharrefreplace' as well as any other name registered with\n\
10451codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010452
10453static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010454unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010455{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010456 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010457 char *encoding = NULL;
10458 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010459
Benjamin Peterson308d6372009-09-18 21:42:35 +000010460 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10461 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010462 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010463 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010464}
10465
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010466PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010467 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010468\n\
10469Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010470If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010471
10472static PyObject*
10473unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10474{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010475 Py_ssize_t i, j, line_pos, src_len, incr;
10476 Py_UCS4 ch;
10477 PyObject *u;
10478 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010479 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010480 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010481 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010482
10483 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010484 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010485
Antoine Pitrou22425222011-10-04 19:10:51 +020010486 if (PyUnicode_READY(self) == -1)
10487 return NULL;
10488
Thomas Wouters7e474022000-07-16 12:04:32 +000010489 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010490 src_len = PyUnicode_GET_LENGTH(self);
10491 i = j = line_pos = 0;
10492 kind = PyUnicode_KIND(self);
10493 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010494 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010495 for (; i < src_len; i++) {
10496 ch = PyUnicode_READ(kind, src_data, i);
10497 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010498 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010499 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010500 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010501 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010502 goto overflow;
10503 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010504 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010505 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010506 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010507 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010508 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010509 goto overflow;
10510 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010511 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010512 if (ch == '\n' || ch == '\r')
10513 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010514 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010515 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020010516 if (!found && PyUnicode_CheckExact(self)) {
10517 Py_INCREF((PyObject *) self);
10518 return (PyObject *) self;
10519 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010520
Guido van Rossumd57fd912000-03-10 22:53:23 +000010521 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010522 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010523 if (!u)
10524 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010525 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010526
Antoine Pitroue71d5742011-10-04 15:55:09 +020010527 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010528
Antoine Pitroue71d5742011-10-04 15:55:09 +020010529 for (; i < src_len; i++) {
10530 ch = PyUnicode_READ(kind, src_data, i);
10531 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010532 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010533 incr = tabsize - (line_pos % tabsize);
10534 line_pos += incr;
10535 while (incr--) {
10536 PyUnicode_WRITE(kind, dest_data, j, ' ');
10537 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010538 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010539 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010540 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010541 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010542 line_pos++;
10543 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010544 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010545 if (ch == '\n' || ch == '\r')
10546 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010547 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010548 }
10549 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinner17efeed2011-10-04 20:05:46 +020010550#ifndef DONT_MAKE_RESULT_READY
10551 if (_PyUnicode_READY_REPLACE(&u)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010552 Py_DECREF(u);
10553 return NULL;
10554 }
Victor Stinner17efeed2011-10-04 20:05:46 +020010555#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010556 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010557 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010558
Antoine Pitroue71d5742011-10-04 15:55:09 +020010559 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010560 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10561 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010562}
10563
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010564PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010565 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010566\n\
10567Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010568such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010569arguments start and end are interpreted as in slice notation.\n\
10570\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010571Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010572
10573static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010574unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010575{
Jesus Ceaac451502011-04-20 17:09:23 +020010576 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010577 Py_ssize_t start;
10578 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010579 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010580
Jesus Ceaac451502011-04-20 17:09:23 +020010581 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10582 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010583 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010584
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010585 if (PyUnicode_READY(self) == -1)
10586 return NULL;
10587 if (PyUnicode_READY(substring) == -1)
10588 return NULL;
10589
10590 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +020010591 asciilib_find_slice, ucs1lib_find_slice,
10592 ucs2lib_find_slice, ucs4lib_find_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010593 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010594 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010595
10596 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010597
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010598 if (result == -2)
10599 return NULL;
10600
Christian Heimes217cfd12007-12-02 14:31:20 +000010601 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010602}
10603
10604static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010605unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010606{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010607 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10608 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010609 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010610 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010611}
10612
Guido van Rossumc2504932007-09-18 19:42:40 +000010613/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010614 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010615static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000010616unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010617{
Guido van Rossumc2504932007-09-18 19:42:40 +000010618 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010619 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010620
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010621 if (_PyUnicode_HASH(self) != -1)
10622 return _PyUnicode_HASH(self);
10623 if (PyUnicode_READY(self) == -1)
10624 return -1;
10625 len = PyUnicode_GET_LENGTH(self);
10626
10627 /* The hash function as a macro, gets expanded three times below. */
10628#define HASH(P) \
10629 x = (Py_uhash_t)*P << 7; \
10630 while (--len >= 0) \
10631 x = (1000003*x) ^ (Py_uhash_t)*P++;
10632
10633 switch (PyUnicode_KIND(self)) {
10634 case PyUnicode_1BYTE_KIND: {
10635 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10636 HASH(c);
10637 break;
10638 }
10639 case PyUnicode_2BYTE_KIND: {
10640 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10641 HASH(s);
10642 break;
10643 }
10644 default: {
10645 Py_UCS4 *l;
10646 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10647 "Impossible switch case in unicode_hash");
10648 l = PyUnicode_4BYTE_DATA(self);
10649 HASH(l);
10650 break;
10651 }
10652 }
10653 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10654
Guido van Rossumc2504932007-09-18 19:42:40 +000010655 if (x == -1)
10656 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010657 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010658 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010659}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010660#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010661
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010662PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010663 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010664\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010665Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010666
10667static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010668unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010669{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010670 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020010671 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010672 Py_ssize_t start;
10673 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010674
Jesus Ceaac451502011-04-20 17:09:23 +020010675 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10676 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010677 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010678
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010679 if (PyUnicode_READY(self) == -1)
10680 return NULL;
10681 if (PyUnicode_READY(substring) == -1)
10682 return NULL;
10683
10684 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +020010685 asciilib_find_slice, ucs1lib_find_slice,
10686 ucs2lib_find_slice, ucs4lib_find_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010687 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010688 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010689
10690 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010691
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010692 if (result == -2)
10693 return NULL;
10694
Guido van Rossumd57fd912000-03-10 22:53:23 +000010695 if (result < 0) {
10696 PyErr_SetString(PyExc_ValueError, "substring not found");
10697 return NULL;
10698 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010699
Christian Heimes217cfd12007-12-02 14:31:20 +000010700 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010701}
10702
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010703PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010704 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010705\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010706Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010707at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010708
10709static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010710unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010711{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010712 Py_ssize_t i, length;
10713 int kind;
10714 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010715 int cased;
10716
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010717 if (PyUnicode_READY(self) == -1)
10718 return NULL;
10719 length = PyUnicode_GET_LENGTH(self);
10720 kind = PyUnicode_KIND(self);
10721 data = PyUnicode_DATA(self);
10722
Guido van Rossumd57fd912000-03-10 22:53:23 +000010723 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010724 if (length == 1)
10725 return PyBool_FromLong(
10726 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010727
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010728 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010729 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010730 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010731
Guido van Rossumd57fd912000-03-10 22:53:23 +000010732 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010733 for (i = 0; i < length; i++) {
10734 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010735
Benjamin Peterson29060642009-01-31 22:14:21 +000010736 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10737 return PyBool_FromLong(0);
10738 else if (!cased && Py_UNICODE_ISLOWER(ch))
10739 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010740 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010741 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010742}
10743
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010744PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010745 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010746\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010747Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010748at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010749
10750static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010751unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010752{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010753 Py_ssize_t i, length;
10754 int kind;
10755 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010756 int cased;
10757
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010758 if (PyUnicode_READY(self) == -1)
10759 return NULL;
10760 length = PyUnicode_GET_LENGTH(self);
10761 kind = PyUnicode_KIND(self);
10762 data = PyUnicode_DATA(self);
10763
Guido van Rossumd57fd912000-03-10 22:53:23 +000010764 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010765 if (length == 1)
10766 return PyBool_FromLong(
10767 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010768
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010769 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010770 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010771 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010772
Guido van Rossumd57fd912000-03-10 22:53:23 +000010773 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010774 for (i = 0; i < length; i++) {
10775 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010776
Benjamin Peterson29060642009-01-31 22:14:21 +000010777 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10778 return PyBool_FromLong(0);
10779 else if (!cased && Py_UNICODE_ISUPPER(ch))
10780 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010781 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010782 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010783}
10784
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010785PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010786 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010787\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010788Return True if S is a titlecased string and there is at least one\n\
10789character in S, i.e. upper- and titlecase characters may only\n\
10790follow uncased characters and lowercase characters only cased ones.\n\
10791Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010792
10793static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010794unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010795{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010796 Py_ssize_t i, length;
10797 int kind;
10798 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010799 int cased, previous_is_cased;
10800
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010801 if (PyUnicode_READY(self) == -1)
10802 return NULL;
10803 length = PyUnicode_GET_LENGTH(self);
10804 kind = PyUnicode_KIND(self);
10805 data = PyUnicode_DATA(self);
10806
Guido van Rossumd57fd912000-03-10 22:53:23 +000010807 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010808 if (length == 1) {
10809 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10810 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10811 (Py_UNICODE_ISUPPER(ch) != 0));
10812 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010813
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010814 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010815 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010816 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010817
Guido van Rossumd57fd912000-03-10 22:53:23 +000010818 cased = 0;
10819 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010820 for (i = 0; i < length; i++) {
10821 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010822
Benjamin Peterson29060642009-01-31 22:14:21 +000010823 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10824 if (previous_is_cased)
10825 return PyBool_FromLong(0);
10826 previous_is_cased = 1;
10827 cased = 1;
10828 }
10829 else if (Py_UNICODE_ISLOWER(ch)) {
10830 if (!previous_is_cased)
10831 return PyBool_FromLong(0);
10832 previous_is_cased = 1;
10833 cased = 1;
10834 }
10835 else
10836 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010837 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010838 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010839}
10840
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010841PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010842 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010843\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010844Return True if all characters in S are whitespace\n\
10845and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010846
10847static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010848unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010849{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010850 Py_ssize_t i, length;
10851 int kind;
10852 void *data;
10853
10854 if (PyUnicode_READY(self) == -1)
10855 return NULL;
10856 length = PyUnicode_GET_LENGTH(self);
10857 kind = PyUnicode_KIND(self);
10858 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010859
Guido van Rossumd57fd912000-03-10 22:53:23 +000010860 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010861 if (length == 1)
10862 return PyBool_FromLong(
10863 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010864
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010865 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010866 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010867 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010868
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010869 for (i = 0; i < length; i++) {
10870 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010871 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010872 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010873 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010874 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010875}
10876
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010877PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010878 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010879\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010880Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010881and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010882
10883static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010884unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010885{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010886 Py_ssize_t i, length;
10887 int kind;
10888 void *data;
10889
10890 if (PyUnicode_READY(self) == -1)
10891 return NULL;
10892 length = PyUnicode_GET_LENGTH(self);
10893 kind = PyUnicode_KIND(self);
10894 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010895
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010896 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010897 if (length == 1)
10898 return PyBool_FromLong(
10899 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010900
10901 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010902 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010903 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010904
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010905 for (i = 0; i < length; i++) {
10906 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010907 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010908 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010909 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010910}
10911
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010912PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010913 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010914\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010915Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010916and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010917
10918static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010919unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010920{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010921 int kind;
10922 void *data;
10923 Py_ssize_t len, i;
10924
10925 if (PyUnicode_READY(self) == -1)
10926 return NULL;
10927
10928 kind = PyUnicode_KIND(self);
10929 data = PyUnicode_DATA(self);
10930 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010931
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010932 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010933 if (len == 1) {
10934 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10935 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10936 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010937
10938 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010939 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010940 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010941
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010942 for (i = 0; i < len; i++) {
10943 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010944 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010945 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010946 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010947 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010948}
10949
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010950PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010951 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010952\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010953Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010954False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010955
10956static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010957unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010958{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010959 Py_ssize_t i, length;
10960 int kind;
10961 void *data;
10962
10963 if (PyUnicode_READY(self) == -1)
10964 return NULL;
10965 length = PyUnicode_GET_LENGTH(self);
10966 kind = PyUnicode_KIND(self);
10967 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010968
Guido van Rossumd57fd912000-03-10 22:53:23 +000010969 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010970 if (length == 1)
10971 return PyBool_FromLong(
10972 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010973
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010974 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010975 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010976 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010977
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010978 for (i = 0; i < length; i++) {
10979 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010980 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010981 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010982 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010983}
10984
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010985PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010986 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010987\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010988Return True if all characters in S are digits\n\
10989and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010990
10991static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010992unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010993{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010994 Py_ssize_t i, length;
10995 int kind;
10996 void *data;
10997
10998 if (PyUnicode_READY(self) == -1)
10999 return NULL;
11000 length = PyUnicode_GET_LENGTH(self);
11001 kind = PyUnicode_KIND(self);
11002 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011003
Guido van Rossumd57fd912000-03-10 22:53:23 +000011004 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011005 if (length == 1) {
11006 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11007 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11008 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011009
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011010 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011011 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011012 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011013
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011014 for (i = 0; i < length; i++) {
11015 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011016 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011017 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011018 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011019}
11020
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011021PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011022 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011023\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011024Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011025False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011026
11027static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011028unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011029{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011030 Py_ssize_t i, length;
11031 int kind;
11032 void *data;
11033
11034 if (PyUnicode_READY(self) == -1)
11035 return NULL;
11036 length = PyUnicode_GET_LENGTH(self);
11037 kind = PyUnicode_KIND(self);
11038 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011039
Guido van Rossumd57fd912000-03-10 22:53:23 +000011040 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011041 if (length == 1)
11042 return PyBool_FromLong(
11043 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011044
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011045 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011046 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011047 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011048
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011049 for (i = 0; i < length; i++) {
11050 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011051 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011052 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011053 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011054}
11055
Martin v. Löwis47383402007-08-15 07:32:56 +000011056int
11057PyUnicode_IsIdentifier(PyObject *self)
11058{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011059 int kind;
11060 void *data;
11061 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011062 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011063
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011064 if (PyUnicode_READY(self) == -1) {
11065 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011066 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011067 }
11068
11069 /* Special case for empty strings */
11070 if (PyUnicode_GET_LENGTH(self) == 0)
11071 return 0;
11072 kind = PyUnicode_KIND(self);
11073 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011074
11075 /* PEP 3131 says that the first character must be in
11076 XID_Start and subsequent characters in XID_Continue,
11077 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011078 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011079 letters, digits, underscore). However, given the current
11080 definition of XID_Start and XID_Continue, it is sufficient
11081 to check just for these, except that _ must be allowed
11082 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011083 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011084 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011085 return 0;
11086
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011087 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011088 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011089 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011090 return 1;
11091}
11092
11093PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011094 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011095\n\
11096Return True if S is a valid identifier according\n\
11097to the language definition.");
11098
11099static PyObject*
11100unicode_isidentifier(PyObject *self)
11101{
11102 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11103}
11104
Georg Brandl559e5d72008-06-11 18:37:52 +000011105PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011106 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011107\n\
11108Return True if all characters in S are considered\n\
11109printable in repr() or S is empty, False otherwise.");
11110
11111static PyObject*
11112unicode_isprintable(PyObject *self)
11113{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011114 Py_ssize_t i, length;
11115 int kind;
11116 void *data;
11117
11118 if (PyUnicode_READY(self) == -1)
11119 return NULL;
11120 length = PyUnicode_GET_LENGTH(self);
11121 kind = PyUnicode_KIND(self);
11122 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011123
11124 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011125 if (length == 1)
11126 return PyBool_FromLong(
11127 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011128
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011129 for (i = 0; i < length; i++) {
11130 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011131 Py_RETURN_FALSE;
11132 }
11133 }
11134 Py_RETURN_TRUE;
11135}
11136
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011137PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011138 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011139\n\
11140Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011141iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011142
11143static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011144unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011145{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011146 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011147}
11148
Martin v. Löwis18e16552006-02-15 17:27:45 +000011149static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000011150unicode_length(PyUnicodeObject *self)
11151{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011152 if (PyUnicode_READY(self) == -1)
11153 return -1;
11154 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011155}
11156
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011157PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011158 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011159\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011160Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011161done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011162
11163static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011164unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011165{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011166 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011167 Py_UCS4 fillchar = ' ';
11168
11169 if (PyUnicode_READY(self) == -1)
11170 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011171
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011172 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011173 return NULL;
11174
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011175 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011176 Py_INCREF(self);
11177 return (PyObject*) self;
11178 }
11179
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011180 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011181}
11182
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011183PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011184 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011185\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011186Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011187
11188static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011189unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011190{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011191 return fixup(self, fixlower);
11192}
11193
/* Selector values telling the strip helpers which end(s) to trim. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the selectors above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for selector i: its format string minus the 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (&stripformat[i][3])
11202
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011203/* externally visible for str.strip(unicode) */
11204PyObject *
11205_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
11206{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011207 void *data;
11208 int kind;
11209 Py_ssize_t i, j, len;
11210 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011211
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011212 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11213 return NULL;
11214
11215 kind = PyUnicode_KIND(self);
11216 data = PyUnicode_DATA(self);
11217 len = PyUnicode_GET_LENGTH(self);
11218 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11219 PyUnicode_DATA(sepobj),
11220 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011221
Benjamin Peterson14339b62009-01-31 16:36:08 +000011222 i = 0;
11223 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011224 while (i < len &&
11225 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011226 i++;
11227 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011228 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011229
Benjamin Peterson14339b62009-01-31 16:36:08 +000011230 j = len;
11231 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011232 do {
11233 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011234 } while (j >= i &&
11235 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011236 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011237 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011238
Victor Stinner12bab6d2011-10-01 01:53:49 +020011239 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011240}
11241
11242PyObject*
11243PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11244{
11245 unsigned char *data;
11246 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011247 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011248
Victor Stinnerde636f32011-10-01 03:55:54 +020011249 if (PyUnicode_READY(self) == -1)
11250 return NULL;
11251
11252 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11253
Victor Stinner12bab6d2011-10-01 01:53:49 +020011254 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011255 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011256 if (PyUnicode_CheckExact(self)) {
11257 Py_INCREF(self);
11258 return self;
11259 }
11260 else
11261 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011262 }
11263
Victor Stinner12bab6d2011-10-01 01:53:49 +020011264 length = end - start;
11265 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011266 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011267
Victor Stinnerde636f32011-10-01 03:55:54 +020011268 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011269 PyErr_SetString(PyExc_IndexError, "string index out of range");
11270 return NULL;
11271 }
11272
Victor Stinnerb9275c12011-10-05 14:01:42 +020011273 if (PyUnicode_IS_ASCII(self)) {
11274 kind = PyUnicode_KIND(self);
11275 data = PyUnicode_1BYTE_DATA(self);
11276 return unicode_fromascii(data + start, length);
11277 }
11278 else {
11279 kind = PyUnicode_KIND(self);
11280 data = PyUnicode_1BYTE_DATA(self);
11281 return PyUnicode_FromKindAndData(kind,
11282 data + PyUnicode_KIND_SIZE(kind, start),
11283 length);
11284 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011285}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011286
11287static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011288do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011289{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011290 int kind;
11291 void *data;
11292 Py_ssize_t len, i, j;
11293
11294 if (PyUnicode_READY(self) == -1)
11295 return NULL;
11296
11297 kind = PyUnicode_KIND(self);
11298 data = PyUnicode_DATA(self);
11299 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011300
Benjamin Peterson14339b62009-01-31 16:36:08 +000011301 i = 0;
11302 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011303 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011304 i++;
11305 }
11306 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011307
Benjamin Peterson14339b62009-01-31 16:36:08 +000011308 j = len;
11309 if (striptype != LEFTSTRIP) {
11310 do {
11311 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011312 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011313 j++;
11314 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011315
Victor Stinner12bab6d2011-10-01 01:53:49 +020011316 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011317}
11318
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011319
11320static PyObject *
11321do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
11322{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011323 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011324
Benjamin Peterson14339b62009-01-31 16:36:08 +000011325 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11326 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011327
Benjamin Peterson14339b62009-01-31 16:36:08 +000011328 if (sep != NULL && sep != Py_None) {
11329 if (PyUnicode_Check(sep))
11330 return _PyUnicode_XStrip(self, striptype, sep);
11331 else {
11332 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011333 "%s arg must be None or str",
11334 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011335 return NULL;
11336 }
11337 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011338
Benjamin Peterson14339b62009-01-31 16:36:08 +000011339 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011340}
11341
11342
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011343PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011344 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011345\n\
11346Return a copy of the string S with leading and trailing\n\
11347whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011348If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011349
11350static PyObject *
11351unicode_strip(PyUnicodeObject *self, PyObject *args)
11352{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011353 if (PyTuple_GET_SIZE(args) == 0)
11354 return do_strip(self, BOTHSTRIP); /* Common case */
11355 else
11356 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011357}
11358
11359
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011360PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011361 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011362\n\
11363Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011364If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011365
11366static PyObject *
11367unicode_lstrip(PyUnicodeObject *self, PyObject *args)
11368{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011369 if (PyTuple_GET_SIZE(args) == 0)
11370 return do_strip(self, LEFTSTRIP); /* Common case */
11371 else
11372 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011373}
11374
11375
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011376PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011377 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011378\n\
11379Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011380If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011381
11382static PyObject *
11383unicode_rstrip(PyUnicodeObject *self, PyObject *args)
11384{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011385 if (PyTuple_GET_SIZE(args) == 0)
11386 return do_strip(self, RIGHTSTRIP); /* Common case */
11387 else
11388 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011389}
11390
11391
Guido van Rossumd57fd912000-03-10 22:53:23 +000011392static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000011393unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011394{
11395 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011396 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011397
Georg Brandl222de0f2009-04-12 12:01:50 +000011398 if (len < 1) {
11399 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011400 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011401 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011402
Tim Peters7a29bd52001-09-12 03:03:31 +000011403 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011404 /* no repeat, return original string */
11405 Py_INCREF(str);
11406 return (PyObject*) str;
11407 }
Tim Peters8f422462000-09-09 06:13:41 +000011408
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011409 if (PyUnicode_READY(str) == -1)
11410 return NULL;
11411
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011412 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011413 PyErr_SetString(PyExc_OverflowError,
11414 "repeated string is too long");
11415 return NULL;
11416 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011417 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011418
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011419 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011420 if (!u)
11421 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011422 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011423
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011424 if (PyUnicode_GET_LENGTH(str) == 1) {
11425 const int kind = PyUnicode_KIND(str);
11426 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11427 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011428 if (kind == PyUnicode_1BYTE_KIND)
11429 memset(to, (unsigned char)fill_char, len);
11430 else {
11431 for (n = 0; n < len; ++n)
11432 PyUnicode_WRITE(kind, to, n, fill_char);
11433 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011434 }
11435 else {
11436 /* number of characters copied this far */
11437 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
11438 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
11439 char *to = (char *) PyUnicode_DATA(u);
11440 Py_MEMCPY(to, PyUnicode_DATA(str),
11441 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011442 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011443 n = (done <= nchars-done) ? done : nchars-done;
11444 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011445 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011446 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011447 }
11448
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011449 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011450 return (PyObject*) u;
11451}
11452
Alexander Belopolsky40018472011-02-26 01:02:56 +000011453PyObject *
11454PyUnicode_Replace(PyObject *obj,
11455 PyObject *subobj,
11456 PyObject *replobj,
11457 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011458{
11459 PyObject *self;
11460 PyObject *str1;
11461 PyObject *str2;
11462 PyObject *result;
11463
11464 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011465 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011466 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011467 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011468 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011469 Py_DECREF(self);
11470 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011471 }
11472 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011473 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011474 Py_DECREF(self);
11475 Py_DECREF(str1);
11476 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011477 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011478 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011479 Py_DECREF(self);
11480 Py_DECREF(str1);
11481 Py_DECREF(str2);
11482 return result;
11483}
11484
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011485PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011486 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011487\n\
11488Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011489old replaced by new. If the optional argument count is\n\
11490given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011491
11492static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011493unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011494{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011495 PyObject *str1;
11496 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011497 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011498 PyObject *result;
11499
Martin v. Löwis18e16552006-02-15 17:27:45 +000011500 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011501 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011502 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011503 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011504 str1 = PyUnicode_FromObject(str1);
11505 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11506 return NULL;
11507 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011508 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011509 Py_DECREF(str1);
11510 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011511 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011512
11513 result = replace(self, str1, str2, maxcount);
11514
11515 Py_DECREF(str1);
11516 Py_DECREF(str2);
11517 return result;
11518}
11519
Alexander Belopolsky40018472011-02-26 01:02:56 +000011520static PyObject *
11521unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011522{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011523 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011524 Py_ssize_t isize;
11525 Py_ssize_t osize, squote, dquote, i, o;
11526 Py_UCS4 max, quote;
11527 int ikind, okind;
11528 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011529
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011530 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011531 return NULL;
11532
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011533 isize = PyUnicode_GET_LENGTH(unicode);
11534 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011535
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011536 /* Compute length of output, quote characters, and
11537 maximum character */
11538 osize = 2; /* quotes */
11539 max = 127;
11540 squote = dquote = 0;
11541 ikind = PyUnicode_KIND(unicode);
11542 for (i = 0; i < isize; i++) {
11543 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11544 switch (ch) {
11545 case '\'': squote++; osize++; break;
11546 case '"': dquote++; osize++; break;
11547 case '\\': case '\t': case '\r': case '\n':
11548 osize += 2; break;
11549 default:
11550 /* Fast-path ASCII */
11551 if (ch < ' ' || ch == 0x7f)
11552 osize += 4; /* \xHH */
11553 else if (ch < 0x7f)
11554 osize++;
11555 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11556 osize++;
11557 max = ch > max ? ch : max;
11558 }
11559 else if (ch < 0x100)
11560 osize += 4; /* \xHH */
11561 else if (ch < 0x10000)
11562 osize += 6; /* \uHHHH */
11563 else
11564 osize += 10; /* \uHHHHHHHH */
11565 }
11566 }
11567
11568 quote = '\'';
11569 if (squote) {
11570 if (dquote)
11571 /* Both squote and dquote present. Use squote,
11572 and escape them */
11573 osize += squote;
11574 else
11575 quote = '"';
11576 }
11577
11578 repr = PyUnicode_New(osize, max);
11579 if (repr == NULL)
11580 return NULL;
11581 okind = PyUnicode_KIND(repr);
11582 odata = PyUnicode_DATA(repr);
11583
11584 PyUnicode_WRITE(okind, odata, 0, quote);
11585 PyUnicode_WRITE(okind, odata, osize-1, quote);
11586
11587 for (i = 0, o = 1; i < isize; i++) {
11588 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011589
11590 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011591 if ((ch == quote) || (ch == '\\')) {
11592 PyUnicode_WRITE(okind, odata, o++, '\\');
11593 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011594 continue;
11595 }
11596
Benjamin Peterson29060642009-01-31 22:14:21 +000011597 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011598 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011599 PyUnicode_WRITE(okind, odata, o++, '\\');
11600 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011601 }
11602 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011603 PyUnicode_WRITE(okind, odata, o++, '\\');
11604 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011605 }
11606 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011607 PyUnicode_WRITE(okind, odata, o++, '\\');
11608 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011609 }
11610
11611 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011612 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011613 PyUnicode_WRITE(okind, odata, o++, '\\');
11614 PyUnicode_WRITE(okind, odata, o++, 'x');
11615 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11616 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011617 }
11618
Georg Brandl559e5d72008-06-11 18:37:52 +000011619 /* Copy ASCII characters as-is */
11620 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011621 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011622 }
11623
Benjamin Peterson29060642009-01-31 22:14:21 +000011624 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011625 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011626 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011627 (categories Z* and C* except ASCII space)
11628 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011629 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011630 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011631 if (ch <= 0xff) {
11632 PyUnicode_WRITE(okind, odata, o++, '\\');
11633 PyUnicode_WRITE(okind, odata, o++, 'x');
11634 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11635 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011636 }
11637 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011638 else if (ch >= 0x10000) {
11639 PyUnicode_WRITE(okind, odata, o++, '\\');
11640 PyUnicode_WRITE(okind, odata, o++, 'U');
11641 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
11642 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
11643 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
11644 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
11645 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11646 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11647 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11648 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011649 }
11650 /* Map 16-bit characters to '\uxxxx' */
11651 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011652 PyUnicode_WRITE(okind, odata, o++, '\\');
11653 PyUnicode_WRITE(okind, odata, o++, 'u');
11654 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11655 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11656 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11657 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011658 }
11659 }
11660 /* Copy characters as-is */
11661 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011662 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011663 }
11664 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011665 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011666 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020011667 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000011668 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011669}
11670
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011671PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011672 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011673\n\
11674Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011675such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011676arguments start and end are interpreted as in slice notation.\n\
11677\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011678Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011679
11680static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011681unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011682{
Jesus Ceaac451502011-04-20 17:09:23 +020011683 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011684 Py_ssize_t start;
11685 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011686 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011687
Jesus Ceaac451502011-04-20 17:09:23 +020011688 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11689 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011690 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011691
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011692 if (PyUnicode_READY(self) == -1)
11693 return NULL;
11694 if (PyUnicode_READY(substring) == -1)
11695 return NULL;
11696
11697 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +020011698 asciilib_rfind_slice, ucs1lib_rfind_slice,
11699 ucs2lib_rfind_slice, ucs4lib_rfind_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011700 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011701 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011702
11703 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011704
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011705 if (result == -2)
11706 return NULL;
11707
Christian Heimes217cfd12007-12-02 14:31:20 +000011708 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011709}
11710
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011711PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011712 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011713\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011714Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011715
11716static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011717unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011718{
Jesus Ceaac451502011-04-20 17:09:23 +020011719 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011720 Py_ssize_t start;
11721 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011722 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011723
Jesus Ceaac451502011-04-20 17:09:23 +020011724 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11725 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011726 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011727
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011728 if (PyUnicode_READY(self) == -1)
11729 return NULL;
11730 if (PyUnicode_READY(substring) == -1)
11731 return NULL;
11732
11733 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +020011734 asciilib_rfind_slice, ucs1lib_rfind_slice,
11735 ucs2lib_rfind_slice, ucs4lib_rfind_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011736 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011737 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011738
11739 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011740
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011741 if (result == -2)
11742 return NULL;
11743
Guido van Rossumd57fd912000-03-10 22:53:23 +000011744 if (result < 0) {
11745 PyErr_SetString(PyExc_ValueError, "substring not found");
11746 return NULL;
11747 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011748
Christian Heimes217cfd12007-12-02 14:31:20 +000011749 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011750}
11751
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011752PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011753 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011754\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011755Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011756done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011757
11758static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011759unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011760{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011761 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011762 Py_UCS4 fillchar = ' ';
11763
Victor Stinnere9a29352011-10-01 02:14:59 +020011764 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011765 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011766
Victor Stinnere9a29352011-10-01 02:14:59 +020011767 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011768 return NULL;
11769
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011770 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011771 Py_INCREF(self);
11772 return (PyObject*) self;
11773 }
11774
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011775 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011776}
11777
Alexander Belopolsky40018472011-02-26 01:02:56 +000011778PyObject *
11779PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011780{
11781 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011782
Guido van Rossumd57fd912000-03-10 22:53:23 +000011783 s = PyUnicode_FromObject(s);
11784 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011785 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011786 if (sep != NULL) {
11787 sep = PyUnicode_FromObject(sep);
11788 if (sep == NULL) {
11789 Py_DECREF(s);
11790 return NULL;
11791 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011792 }
11793
Victor Stinner9310abb2011-10-05 00:59:23 +020011794 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011795
11796 Py_DECREF(s);
11797 Py_XDECREF(sep);
11798 return result;
11799}
11800
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011801PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011802 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011803\n\
11804Return a list of the words in S, using sep as the\n\
11805delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011806splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011807whitespace string is a separator and empty strings are\n\
11808removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011809
11810static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011811unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011812{
11813 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011814 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011815
Martin v. Löwis18e16552006-02-15 17:27:45 +000011816 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011817 return NULL;
11818
11819 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011820 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011821 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020011822 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011823 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011824 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011825}
11826
Thomas Wouters477c8d52006-05-27 19:21:47 +000011827PyObject *
11828PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11829{
11830 PyObject* str_obj;
11831 PyObject* sep_obj;
11832 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011833 int kind1, kind2, kind;
11834 void *buf1 = NULL, *buf2 = NULL;
11835 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011836
11837 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011838 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011839 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011840 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011841 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011842 Py_DECREF(str_obj);
11843 return NULL;
11844 }
11845
Victor Stinner14f8f022011-10-05 20:58:25 +020011846 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011847 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020011848 kind = Py_MAX(kind1, kind2);
11849 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011850 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020011851 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011852 if (!buf1)
11853 goto onError;
11854 buf2 = PyUnicode_DATA(sep_obj);
11855 if (kind2 != kind)
11856 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11857 if (!buf2)
11858 goto onError;
11859 len1 = PyUnicode_GET_LENGTH(str_obj);
11860 len2 = PyUnicode_GET_LENGTH(sep_obj);
11861
Victor Stinner14f8f022011-10-05 20:58:25 +020011862 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011863 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020011864 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
11865 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11866 else
11867 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011868 break;
11869 case PyUnicode_2BYTE_KIND:
11870 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11871 break;
11872 case PyUnicode_4BYTE_KIND:
11873 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11874 break;
11875 default:
11876 assert(0);
11877 out = 0;
11878 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011879
11880 Py_DECREF(sep_obj);
11881 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011882 if (kind1 != kind)
11883 PyMem_Free(buf1);
11884 if (kind2 != kind)
11885 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011886
11887 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011888 onError:
11889 Py_DECREF(sep_obj);
11890 Py_DECREF(str_obj);
11891 if (kind1 != kind && buf1)
11892 PyMem_Free(buf1);
11893 if (kind2 != kind && buf2)
11894 PyMem_Free(buf2);
11895 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011896}
11897
11898
11899PyObject *
11900PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11901{
11902 PyObject* str_obj;
11903 PyObject* sep_obj;
11904 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011905 int kind1, kind2, kind;
11906 void *buf1 = NULL, *buf2 = NULL;
11907 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011908
11909 str_obj = PyUnicode_FromObject(str_in);
11910 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011911 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011912 sep_obj = PyUnicode_FromObject(sep_in);
11913 if (!sep_obj) {
11914 Py_DECREF(str_obj);
11915 return NULL;
11916 }
11917
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011918 kind1 = PyUnicode_KIND(str_in);
11919 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011920 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011921 buf1 = PyUnicode_DATA(str_in);
11922 if (kind1 != kind)
11923 buf1 = _PyUnicode_AsKind(str_in, kind);
11924 if (!buf1)
11925 goto onError;
11926 buf2 = PyUnicode_DATA(sep_obj);
11927 if (kind2 != kind)
11928 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11929 if (!buf2)
11930 goto onError;
11931 len1 = PyUnicode_GET_LENGTH(str_obj);
11932 len2 = PyUnicode_GET_LENGTH(sep_obj);
11933
11934 switch(PyUnicode_KIND(str_in)) {
11935 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020011936 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
11937 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11938 else
11939 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011940 break;
11941 case PyUnicode_2BYTE_KIND:
11942 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11943 break;
11944 case PyUnicode_4BYTE_KIND:
11945 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11946 break;
11947 default:
11948 assert(0);
11949 out = 0;
11950 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011951
11952 Py_DECREF(sep_obj);
11953 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011954 if (kind1 != kind)
11955 PyMem_Free(buf1);
11956 if (kind2 != kind)
11957 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011958
11959 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011960 onError:
11961 Py_DECREF(sep_obj);
11962 Py_DECREF(str_obj);
11963 if (kind1 != kind && buf1)
11964 PyMem_Free(buf1);
11965 if (kind2 != kind && buf2)
11966 PyMem_Free(buf2);
11967 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011968}
11969
11970PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011971 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011972\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011973Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011974the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011975found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011976
11977static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011978unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011979{
Victor Stinner9310abb2011-10-05 00:59:23 +020011980 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011981}
11982
11983PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011984 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011985\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011986Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011987the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011988separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011989
11990static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011991unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011992{
Victor Stinner9310abb2011-10-05 00:59:23 +020011993 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011994}
11995
Alexander Belopolsky40018472011-02-26 01:02:56 +000011996PyObject *
11997PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011998{
11999 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012000
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012001 s = PyUnicode_FromObject(s);
12002 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012003 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012004 if (sep != NULL) {
12005 sep = PyUnicode_FromObject(sep);
12006 if (sep == NULL) {
12007 Py_DECREF(s);
12008 return NULL;
12009 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012010 }
12011
Victor Stinner9310abb2011-10-05 00:59:23 +020012012 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012013
12014 Py_DECREF(s);
12015 Py_XDECREF(sep);
12016 return result;
12017}
12018
12019PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012020 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012021\n\
12022Return a list of the words in S, using sep as the\n\
12023delimiter string, starting at the end of the string and\n\
12024working to the front. If maxsplit is given, at most maxsplit\n\
12025splits are done. If sep is not specified, any whitespace string\n\
12026is a separator.");
12027
12028static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012029unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012030{
12031 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012032 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012033
Martin v. Löwis18e16552006-02-15 17:27:45 +000012034 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012035 return NULL;
12036
12037 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012038 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012039 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012040 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012041 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012042 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012043}
12044
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012045PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012046 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012047\n\
12048Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012049Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012050is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012051
12052static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012053unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012054{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012055 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012056 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012057
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012058 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12059 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012060 return NULL;
12061
Guido van Rossum86662912000-04-11 15:38:46 +000012062 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012063}
12064
12065static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012066PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012067{
Walter Dörwald346737f2007-05-31 10:44:43 +000012068 if (PyUnicode_CheckExact(self)) {
12069 Py_INCREF(self);
12070 return self;
12071 } else
12072 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020012073 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012074}
12075
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012076PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012077 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012078\n\
12079Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012080and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012081
12082static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012083unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012084{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012085 return fixup(self, fixswapcase);
12086}
12087
Georg Brandlceee0772007-11-27 23:48:05 +000012088PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012089 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012090\n\
12091Return a translation table usable for str.translate().\n\
12092If there is only one argument, it must be a dictionary mapping Unicode\n\
12093ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012094Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012095If there are two arguments, they must be strings of equal length, and\n\
12096in the resulting dictionary, each character in x will be mapped to the\n\
12097character at the same position in y. If there is a third argument, it\n\
12098must be a string, whose characters will be mapped to None in the result.");
12099
12100static PyObject*
12101unicode_maketrans(PyUnicodeObject *null, PyObject *args)
12102{
12103 PyObject *x, *y = NULL, *z = NULL;
12104 PyObject *new = NULL, *key, *value;
12105 Py_ssize_t i = 0;
12106 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012107
Georg Brandlceee0772007-11-27 23:48:05 +000012108 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12109 return NULL;
12110 new = PyDict_New();
12111 if (!new)
12112 return NULL;
12113 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012114 int x_kind, y_kind, z_kind;
12115 void *x_data, *y_data, *z_data;
12116
Georg Brandlceee0772007-11-27 23:48:05 +000012117 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012118 if (!PyUnicode_Check(x)) {
12119 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12120 "be a string if there is a second argument");
12121 goto err;
12122 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012123 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012124 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12125 "arguments must have equal length");
12126 goto err;
12127 }
12128 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012129 x_kind = PyUnicode_KIND(x);
12130 y_kind = PyUnicode_KIND(y);
12131 x_data = PyUnicode_DATA(x);
12132 y_data = PyUnicode_DATA(y);
12133 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12134 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12135 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012136 if (!key || !value)
12137 goto err;
12138 res = PyDict_SetItem(new, key, value);
12139 Py_DECREF(key);
12140 Py_DECREF(value);
12141 if (res < 0)
12142 goto err;
12143 }
12144 /* create entries for deleting chars in z */
12145 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012146 z_kind = PyUnicode_KIND(z);
12147 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000012148 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012149 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012150 if (!key)
12151 goto err;
12152 res = PyDict_SetItem(new, key, Py_None);
12153 Py_DECREF(key);
12154 if (res < 0)
12155 goto err;
12156 }
12157 }
12158 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012159 int kind;
12160 void *data;
12161
Georg Brandlceee0772007-11-27 23:48:05 +000012162 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012163 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012164 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12165 "to maketrans it must be a dict");
12166 goto err;
12167 }
12168 /* copy entries into the new dict, converting string keys to int keys */
12169 while (PyDict_Next(x, &i, &key, &value)) {
12170 if (PyUnicode_Check(key)) {
12171 /* convert string keys to integer keys */
12172 PyObject *newkey;
12173 if (PyUnicode_GET_SIZE(key) != 1) {
12174 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12175 "table must be of length 1");
12176 goto err;
12177 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012178 kind = PyUnicode_KIND(key);
12179 data = PyUnicode_DATA(key);
12180 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012181 if (!newkey)
12182 goto err;
12183 res = PyDict_SetItem(new, newkey, value);
12184 Py_DECREF(newkey);
12185 if (res < 0)
12186 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012187 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012188 /* just keep integer keys */
12189 if (PyDict_SetItem(new, key, value) < 0)
12190 goto err;
12191 } else {
12192 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12193 "be strings or integers");
12194 goto err;
12195 }
12196 }
12197 }
12198 return new;
12199 err:
12200 Py_DECREF(new);
12201 return NULL;
12202}
12203
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012204PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012205 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012206\n\
12207Return a copy of the string S, where all characters have been mapped\n\
12208through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012209Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012210Unmapped characters are left untouched. Characters mapped to None\n\
12211are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012212
12213static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012214unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012215{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012216 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012217}
12218
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012219PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012220 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012221\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012222Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012223
12224static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012225unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012226{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012227 return fixup(self, fixupper);
12228}
12229
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012230PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012231 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012232\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012233Pad a numeric string S with zeros on the left, to fill a field\n\
12234of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012235
12236static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012237unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012238{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012239 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012240 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012241 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012242 int kind;
12243 void *data;
12244 Py_UCS4 chr;
12245
12246 if (PyUnicode_READY(self) == -1)
12247 return NULL;
12248
Martin v. Löwis18e16552006-02-15 17:27:45 +000012249 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012250 return NULL;
12251
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012252 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012253 if (PyUnicode_CheckExact(self)) {
12254 Py_INCREF(self);
12255 return (PyObject*) self;
12256 }
12257 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020012258 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012259 }
12260
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012261 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012262
12263 u = pad(self, fill, 0, '0');
12264
Walter Dörwald068325e2002-04-15 13:36:47 +000012265 if (u == NULL)
12266 return NULL;
12267
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012268 kind = PyUnicode_KIND(u);
12269 data = PyUnicode_DATA(u);
12270 chr = PyUnicode_READ(kind, data, fill);
12271
12272 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012273 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012274 PyUnicode_WRITE(kind, data, 0, chr);
12275 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012276 }
12277
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012278 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012279 return (PyObject*) u;
12280}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012281
#if 0
/* Compiled out.  Wrapper that returns the result of
   PyUnicode_TransformDecimalAndSpaceToASCII() for `self`. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    return PyUnicode_TransformDecimalAndSpaceToASCII(self);
}
#endif
12289
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012290PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012291 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012292\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012293Return True if S starts with the specified prefix, False otherwise.\n\
12294With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012295With optional end, stop comparing S at that position.\n\
12296prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012297
12298static PyObject *
12299unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012300 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012301{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012302 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012303 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012304 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012305 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012306 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012307
Jesus Ceaac451502011-04-20 17:09:23 +020012308 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012309 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012310 if (PyTuple_Check(subobj)) {
12311 Py_ssize_t i;
12312 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12313 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012314 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012315 if (substring == NULL)
12316 return NULL;
12317 result = tailmatch(self, substring, start, end, -1);
12318 Py_DECREF(substring);
12319 if (result) {
12320 Py_RETURN_TRUE;
12321 }
12322 }
12323 /* nothing matched */
12324 Py_RETURN_FALSE;
12325 }
12326 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012327 if (substring == NULL) {
12328 if (PyErr_ExceptionMatches(PyExc_TypeError))
12329 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12330 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012331 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012332 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012333 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012334 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012335 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012336}
12337
12338
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012339PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012340 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012341\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012342Return True if S ends with the specified suffix, False otherwise.\n\
12343With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012344With optional end, stop comparing S at that position.\n\
12345suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012346
12347static PyObject *
12348unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012349 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012350{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012351 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012352 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012353 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012354 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012355 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012356
Jesus Ceaac451502011-04-20 17:09:23 +020012357 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012358 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012359 if (PyTuple_Check(subobj)) {
12360 Py_ssize_t i;
12361 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12362 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012363 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012364 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012365 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012366 result = tailmatch(self, substring, start, end, +1);
12367 Py_DECREF(substring);
12368 if (result) {
12369 Py_RETURN_TRUE;
12370 }
12371 }
12372 Py_RETURN_FALSE;
12373 }
12374 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012375 if (substring == NULL) {
12376 if (PyErr_ExceptionMatches(PyExc_TypeError))
12377 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12378 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012379 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012380 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012381 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012382 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012383 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012384}
12385
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012386#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012387
12388PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012389 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012390\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012391Return a formatted version of S, using substitutions from args and kwargs.\n\
12392The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012393
Eric Smith27bbca62010-11-04 17:06:58 +000012394PyDoc_STRVAR(format_map__doc__,
12395 "S.format_map(mapping) -> str\n\
12396\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012397Return a formatted version of S, using substitutions from mapping.\n\
12398The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012399
Eric Smith4a7d76d2008-05-30 18:10:19 +000012400static PyObject *
12401unicode__format__(PyObject* self, PyObject* args)
12402{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012403 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012404
12405 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12406 return NULL;
12407
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012408 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012409 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012410 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012411}
12412
Eric Smith8c663262007-08-25 02:26:07 +000012413PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012414 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012415\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012416Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012417
12418static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012419unicode__sizeof__(PyUnicodeObject *v)
12420{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012421 Py_ssize_t size;
12422
12423 /* If it's a compact object, account for base structure +
12424 character data. */
12425 if (PyUnicode_IS_COMPACT_ASCII(v))
12426 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12427 else if (PyUnicode_IS_COMPACT(v))
12428 size = sizeof(PyCompactUnicodeObject) +
12429 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
12430 else {
12431 /* If it is a two-block object, account for base object, and
12432 for character block if present. */
12433 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012434 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012435 size += (PyUnicode_GET_LENGTH(v) + 1) *
12436 PyUnicode_CHARACTER_SIZE(v);
12437 }
12438 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012439 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012440 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012441 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012442 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012443 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012444
12445 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012446}
12447
12448PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012449 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012450
12451static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012452unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012453{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012454 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012455 if (!copy)
12456 return NULL;
12457 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012458}
12459
Guido van Rossumd57fd912000-03-10 22:53:23 +000012460static PyMethodDef unicode_methods[] = {
12461
12462 /* Order is according to common usage: often used methods should
12463 appear first, since lookup is done sequentially. */
12464
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012465 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012466 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12467 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012468 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012469 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12470 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12471 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12472 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12473 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12474 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12475 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012476 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012477 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12478 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12479 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012480 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012481 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12482 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12483 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012484 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012485 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012486 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012487 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012488 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12489 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12490 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12491 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12492 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12493 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12494 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12495 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12496 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12497 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12498 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12499 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12500 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12501 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012502 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012503 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012504 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012505 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012506 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012507 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012508 {"maketrans", (PyCFunction) unicode_maketrans,
12509 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012510 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012511#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012512 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012513#endif
12514
12515#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012516 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012517 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012518#endif
12519
Benjamin Peterson14339b62009-01-31 16:36:08 +000012520 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012521 {NULL, NULL}
12522};
12523
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012524static PyObject *
12525unicode_mod(PyObject *v, PyObject *w)
12526{
Brian Curtindfc80e32011-08-10 20:28:54 -050012527 if (!PyUnicode_Check(v))
12528 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012529 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012530}
12531
12532static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012533 0, /*nb_add*/
12534 0, /*nb_subtract*/
12535 0, /*nb_multiply*/
12536 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012537};
12538
Guido van Rossumd57fd912000-03-10 22:53:23 +000012539static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012540 (lenfunc) unicode_length, /* sq_length */
12541 PyUnicode_Concat, /* sq_concat */
12542 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12543 (ssizeargfunc) unicode_getitem, /* sq_item */
12544 0, /* sq_slice */
12545 0, /* sq_ass_item */
12546 0, /* sq_ass_slice */
12547 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012548};
12549
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012550static PyObject*
12551unicode_subscript(PyUnicodeObject* self, PyObject* item)
12552{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012553 if (PyUnicode_READY(self) == -1)
12554 return NULL;
12555
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012556 if (PyIndex_Check(item)) {
12557 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012558 if (i == -1 && PyErr_Occurred())
12559 return NULL;
12560 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012561 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012562 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012563 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012564 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012565 PyObject *result;
12566 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012567 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012568 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012569
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012570 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012571 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012572 return NULL;
12573 }
12574
12575 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012576 return PyUnicode_New(0, 0);
12577 } else if (start == 0 && step == 1 &&
12578 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012579 PyUnicode_CheckExact(self)) {
12580 Py_INCREF(self);
12581 return (PyObject *)self;
12582 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012583 return PyUnicode_Substring((PyObject*)self,
12584 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012585 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012586 /* General case */
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012587 max_char = 0;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012588 src_kind = PyUnicode_KIND(self);
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012589 kind_limit = kind_maxchar_limit(src_kind);
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012590 src_data = PyUnicode_DATA(self);
12591 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12592 ch = PyUnicode_READ(src_kind, src_data, cur);
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012593 if (ch > max_char) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012594 max_char = ch;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012595 if (max_char >= kind_limit)
12596 break;
12597 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012598 }
12599 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012600 if (result == NULL)
12601 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012602 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012603 dest_data = PyUnicode_DATA(result);
12604
12605 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012606 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
12607 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012608 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012609 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012610 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012611 } else {
12612 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12613 return NULL;
12614 }
12615}
12616
12617static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012618 (lenfunc)unicode_length, /* mp_length */
12619 (binaryfunc)unicode_subscript, /* mp_subscript */
12620 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012621};
12622
Guido van Rossumd57fd912000-03-10 22:53:23 +000012623
Guido van Rossumd57fd912000-03-10 22:53:23 +000012624/* Helpers for PyUnicode_Format() */
12625
12626static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012627getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012628{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012629 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012630 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012631 (*p_argidx)++;
12632 if (arglen < 0)
12633 return args;
12634 else
12635 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012636 }
12637 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012638 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012639 return NULL;
12640}
12641
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012642/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012643
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012644static PyObject *
12645formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012646{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012647 char *p;
12648 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012649 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012650
Guido van Rossumd57fd912000-03-10 22:53:23 +000012651 x = PyFloat_AsDouble(v);
12652 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012653 return NULL;
12654
Guido van Rossumd57fd912000-03-10 22:53:23 +000012655 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012656 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012657
Eric Smith0923d1d2009-04-16 20:16:10 +000012658 p = PyOS_double_to_string(x, type, prec,
12659 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012660 if (p == NULL)
12661 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012662 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012663 PyMem_Free(p);
12664 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012665}
12666
Tim Peters38fd5b62000-09-21 05:43:11 +000012667static PyObject*
12668formatlong(PyObject *val, int flags, int prec, int type)
12669{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012670 char *buf;
12671 int len;
12672 PyObject *str; /* temporary string object. */
12673 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012674
Benjamin Peterson14339b62009-01-31 16:36:08 +000012675 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12676 if (!str)
12677 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012678 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012679 Py_DECREF(str);
12680 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012681}
12682
Guido van Rossumd57fd912000-03-10 22:53:23 +000012683static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012684formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012685 size_t buflen,
12686 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012687{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012688 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012689 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012690 if (PyUnicode_GET_LENGTH(v) == 1) {
12691 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012692 buf[1] = '\0';
12693 return 1;
12694 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012695 goto onError;
12696 }
12697 else {
12698 /* Integer input truncated to a character */
12699 long x;
12700 x = PyLong_AsLong(v);
12701 if (x == -1 && PyErr_Occurred())
12702 goto onError;
12703
12704 if (x < 0 || x > 0x10ffff) {
12705 PyErr_SetString(PyExc_OverflowError,
12706 "%c arg not in range(0x110000)");
12707 return -1;
12708 }
12709
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012710 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012711 buf[1] = '\0';
12712 return 1;
12713 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012714
Benjamin Peterson29060642009-01-31 22:14:21 +000012715 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012716 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012717 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012718 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012719}
12720
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012721/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012722 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012723*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012724#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012725
Alexander Belopolsky40018472011-02-26 01:02:56 +000012726PyObject *
12727PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012728{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012729 void *fmt;
12730 int fmtkind;
12731 PyObject *result;
12732 Py_UCS4 *res, *res0;
12733 Py_UCS4 max;
12734 int kind;
12735 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012736 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012737 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012738 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000012739
Guido van Rossumd57fd912000-03-10 22:53:23 +000012740 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012741 PyErr_BadInternalCall();
12742 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012743 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012744 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12745 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012746 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012747 fmt = PyUnicode_DATA(uformat);
12748 fmtkind = PyUnicode_KIND(uformat);
12749 fmtcnt = PyUnicode_GET_LENGTH(uformat);
12750 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012751
12752 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012753 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
12754 if (res0 == NULL) {
12755 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012756 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012757 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012758
12759 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012760 arglen = PyTuple_Size(args);
12761 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012762 }
12763 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012764 arglen = -1;
12765 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012766 }
Christian Heimes90aa7642007-12-19 02:45:37 +000012767 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000012768 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000012769 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012770
12771 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012772 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012773 if (--rescnt < 0) {
12774 rescnt = fmtcnt + 100;
12775 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012776 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12777 if (res0 == NULL){
12778 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012779 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012780 }
12781 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012782 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012783 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012784 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012785 }
12786 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012787 /* Got a format specifier */
12788 int flags = 0;
12789 Py_ssize_t width = -1;
12790 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012791 Py_UCS4 c = '\0';
12792 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000012793 int isnumok;
12794 PyObject *v = NULL;
12795 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012796 void *pbuf;
12797 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000012798 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012799 Py_ssize_t len, len1;
12800 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012801
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012802 fmtpos++;
12803 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12804 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012805 Py_ssize_t keylen;
12806 PyObject *key;
12807 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012808
Benjamin Peterson29060642009-01-31 22:14:21 +000012809 if (dict == NULL) {
12810 PyErr_SetString(PyExc_TypeError,
12811 "format requires a mapping");
12812 goto onError;
12813 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012814 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012815 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012816 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012817 /* Skip over balanced parentheses */
12818 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012819 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012820 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012821 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012822 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012823 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012824 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012825 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012826 if (fmtcnt < 0 || pcount > 0) {
12827 PyErr_SetString(PyExc_ValueError,
12828 "incomplete format key");
12829 goto onError;
12830 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012831 key = PyUnicode_Substring((PyObject*)uformat,
12832 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012833 if (key == NULL)
12834 goto onError;
12835 if (args_owned) {
12836 Py_DECREF(args);
12837 args_owned = 0;
12838 }
12839 args = PyObject_GetItem(dict, key);
12840 Py_DECREF(key);
12841 if (args == NULL) {
12842 goto onError;
12843 }
12844 args_owned = 1;
12845 arglen = -1;
12846 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012847 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012848 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012849 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012850 case '-': flags |= F_LJUST; continue;
12851 case '+': flags |= F_SIGN; continue;
12852 case ' ': flags |= F_BLANK; continue;
12853 case '#': flags |= F_ALT; continue;
12854 case '0': flags |= F_ZERO; continue;
12855 }
12856 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012857 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012858 if (c == '*') {
12859 v = getnextarg(args, arglen, &argidx);
12860 if (v == NULL)
12861 goto onError;
12862 if (!PyLong_Check(v)) {
12863 PyErr_SetString(PyExc_TypeError,
12864 "* wants int");
12865 goto onError;
12866 }
12867 width = PyLong_AsLong(v);
12868 if (width == -1 && PyErr_Occurred())
12869 goto onError;
12870 if (width < 0) {
12871 flags |= F_LJUST;
12872 width = -width;
12873 }
12874 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012875 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012876 }
12877 else if (c >= '0' && c <= '9') {
12878 width = c - '0';
12879 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012880 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012881 if (c < '0' || c > '9')
12882 break;
12883 if ((width*10) / 10 != width) {
12884 PyErr_SetString(PyExc_ValueError,
12885 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012886 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012887 }
12888 width = width*10 + (c - '0');
12889 }
12890 }
12891 if (c == '.') {
12892 prec = 0;
12893 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012894 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012895 if (c == '*') {
12896 v = getnextarg(args, arglen, &argidx);
12897 if (v == NULL)
12898 goto onError;
12899 if (!PyLong_Check(v)) {
12900 PyErr_SetString(PyExc_TypeError,
12901 "* wants int");
12902 goto onError;
12903 }
12904 prec = PyLong_AsLong(v);
12905 if (prec == -1 && PyErr_Occurred())
12906 goto onError;
12907 if (prec < 0)
12908 prec = 0;
12909 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012910 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012911 }
12912 else if (c >= '0' && c <= '9') {
12913 prec = c - '0';
12914 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012915 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012916 if (c < '0' || c > '9')
12917 break;
12918 if ((prec*10) / 10 != prec) {
12919 PyErr_SetString(PyExc_ValueError,
12920 "prec too big");
12921 goto onError;
12922 }
12923 prec = prec*10 + (c - '0');
12924 }
12925 }
12926 } /* prec */
12927 if (fmtcnt >= 0) {
12928 if (c == 'h' || c == 'l' || c == 'L') {
12929 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012930 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012931 }
12932 }
12933 if (fmtcnt < 0) {
12934 PyErr_SetString(PyExc_ValueError,
12935 "incomplete format");
12936 goto onError;
12937 }
12938 if (c != '%') {
12939 v = getnextarg(args, arglen, &argidx);
12940 if (v == NULL)
12941 goto onError;
12942 }
12943 sign = 0;
12944 fill = ' ';
12945 switch (c) {
12946
12947 case '%':
12948 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012949 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012950 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012951 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012952 len = 1;
12953 break;
12954
12955 case 's':
12956 case 'r':
12957 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012958 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012959 temp = v;
12960 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012961 }
12962 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012963 if (c == 's')
12964 temp = PyObject_Str(v);
12965 else if (c == 'r')
12966 temp = PyObject_Repr(v);
12967 else
12968 temp = PyObject_ASCII(v);
12969 if (temp == NULL)
12970 goto onError;
12971 if (PyUnicode_Check(temp))
12972 /* nothing to do */;
12973 else {
12974 Py_DECREF(temp);
12975 PyErr_SetString(PyExc_TypeError,
12976 "%s argument has non-string str()");
12977 goto onError;
12978 }
12979 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012980 if (PyUnicode_READY(temp) == -1) {
12981 Py_CLEAR(temp);
12982 goto onError;
12983 }
12984 pbuf = PyUnicode_DATA(temp);
12985 kind = PyUnicode_KIND(temp);
12986 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012987 if (prec >= 0 && len > prec)
12988 len = prec;
12989 break;
12990
12991 case 'i':
12992 case 'd':
12993 case 'u':
12994 case 'o':
12995 case 'x':
12996 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012997 isnumok = 0;
12998 if (PyNumber_Check(v)) {
12999 PyObject *iobj=NULL;
13000
13001 if (PyLong_Check(v)) {
13002 iobj = v;
13003 Py_INCREF(iobj);
13004 }
13005 else {
13006 iobj = PyNumber_Long(v);
13007 }
13008 if (iobj!=NULL) {
13009 if (PyLong_Check(iobj)) {
13010 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013011 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013012 Py_DECREF(iobj);
13013 if (!temp)
13014 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013015 if (PyUnicode_READY(temp) == -1) {
13016 Py_CLEAR(temp);
13017 goto onError;
13018 }
13019 pbuf = PyUnicode_DATA(temp);
13020 kind = PyUnicode_KIND(temp);
13021 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013022 sign = 1;
13023 }
13024 else {
13025 Py_DECREF(iobj);
13026 }
13027 }
13028 }
13029 if (!isnumok) {
13030 PyErr_Format(PyExc_TypeError,
13031 "%%%c format: a number is required, "
13032 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13033 goto onError;
13034 }
13035 if (flags & F_ZERO)
13036 fill = '0';
13037 break;
13038
13039 case 'e':
13040 case 'E':
13041 case 'f':
13042 case 'F':
13043 case 'g':
13044 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013045 temp = formatfloat(v, flags, prec, c);
13046 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013047 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013048 if (PyUnicode_READY(temp) == -1) {
13049 Py_CLEAR(temp);
13050 goto onError;
13051 }
13052 pbuf = PyUnicode_DATA(temp);
13053 kind = PyUnicode_KIND(temp);
13054 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013055 sign = 1;
13056 if (flags & F_ZERO)
13057 fill = '0';
13058 break;
13059
13060 case 'c':
13061 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013062 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020013063 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000013064 if (len < 0)
13065 goto onError;
13066 break;
13067
13068 default:
13069 PyErr_Format(PyExc_ValueError,
13070 "unsupported format character '%c' (0x%x) "
13071 "at index %zd",
13072 (31<=c && c<=126) ? (char)c : '?',
13073 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013074 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013075 goto onError;
13076 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013077 /* pbuf is initialized here. */
13078 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013079 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013080 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
13081 PyUnicode_READ(kind, pbuf, pindex) == '+') {
13082 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013083 len--;
13084 }
13085 else if (flags & F_SIGN)
13086 sign = '+';
13087 else if (flags & F_BLANK)
13088 sign = ' ';
13089 else
13090 sign = 0;
13091 }
13092 if (width < len)
13093 width = len;
13094 if (rescnt - (sign != 0) < width) {
13095 reslen -= rescnt;
13096 rescnt = width + fmtcnt + 100;
13097 reslen += rescnt;
13098 if (reslen < 0) {
13099 Py_XDECREF(temp);
13100 PyErr_NoMemory();
13101 goto onError;
13102 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013103 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
13104 if (res0 == 0) {
13105 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000013106 Py_XDECREF(temp);
13107 goto onError;
13108 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013109 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000013110 }
13111 if (sign) {
13112 if (fill != ' ')
13113 *res++ = sign;
13114 rescnt--;
13115 if (width > len)
13116 width--;
13117 }
13118 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013119 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13120 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013121 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013122 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
13123 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013124 }
13125 rescnt -= 2;
13126 width -= 2;
13127 if (width < 0)
13128 width = 0;
13129 len -= 2;
13130 }
13131 if (width > len && !(flags & F_LJUST)) {
13132 do {
13133 --rescnt;
13134 *res++ = fill;
13135 } while (--width > len);
13136 }
13137 if (fill == ' ') {
13138 if (sign)
13139 *res++ = sign;
13140 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013141 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13142 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
13143 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
13144 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013145 }
13146 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013147 /* Copy all characters, preserving len */
13148 len1 = len;
13149 while (len1--) {
13150 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
13151 rescnt--;
13152 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013153 while (--width >= len) {
13154 --rescnt;
13155 *res++ = ' ';
13156 }
13157 if (dict && (argidx < arglen) && c != '%') {
13158 PyErr_SetString(PyExc_TypeError,
13159 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000013160 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013161 goto onError;
13162 }
13163 Py_XDECREF(temp);
13164 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013165 } /* until end */
13166 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013167 PyErr_SetString(PyExc_TypeError,
13168 "not all arguments converted during string formatting");
13169 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013170 }
13171
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013172
13173 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
13174 if (*res > max)
13175 max = *res;
13176 result = PyUnicode_New(reslen - rescnt, max);
13177 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000013178 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013179 kind = PyUnicode_KIND(result);
13180 for (res = res0; res < res0+reslen-rescnt; res++)
13181 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
13182 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013183 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013184 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013185 }
13186 Py_DECREF(uformat);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013187 assert(_PyUnicode_CheckConsistency(result, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000013188 return (PyObject *)result;
13189
Benjamin Peterson29060642009-01-31 22:14:21 +000013190 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013191 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013192 Py_DECREF(uformat);
13193 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013194 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013195 }
13196 return NULL;
13197}
13198
Jeremy Hylton938ace62002-07-17 16:30:39 +000013199static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013200unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13201
Tim Peters6d6c1a32001-08-02 04:15:00 +000013202static PyObject *
13203unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13204{
Benjamin Peterson29060642009-01-31 22:14:21 +000013205 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013206 static char *kwlist[] = {"object", "encoding", "errors", 0};
13207 char *encoding = NULL;
13208 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013209
Benjamin Peterson14339b62009-01-31 16:36:08 +000013210 if (type != &PyUnicode_Type)
13211 return unicode_subtype_new(type, args, kwds);
13212 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013213 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013214 return NULL;
13215 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013216 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013217 if (encoding == NULL && errors == NULL)
13218 return PyObject_Str(x);
13219 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013220 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013221}
13222
Guido van Rossume023fe02001-08-30 03:12:59 +000013223static PyObject *
13224unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13225{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013226 PyUnicodeObject *unicode, *self;
13227 Py_ssize_t length, char_size;
13228 int share_wstr, share_utf8;
13229 unsigned int kind;
13230 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013231
Benjamin Peterson14339b62009-01-31 16:36:08 +000013232 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013233
13234 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
13235 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013236 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013237 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013238 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013239 return NULL;
13240
13241 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
13242 if (self == NULL) {
13243 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013244 return NULL;
13245 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013246 kind = PyUnicode_KIND(unicode);
13247 length = PyUnicode_GET_LENGTH(unicode);
13248
13249 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013250#ifdef Py_DEBUG
13251 _PyUnicode_HASH(self) = -1;
13252#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013253 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013254#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013255 _PyUnicode_STATE(self).interned = 0;
13256 _PyUnicode_STATE(self).kind = kind;
13257 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013258 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013259 _PyUnicode_STATE(self).ready = 1;
13260 _PyUnicode_WSTR(self) = NULL;
13261 _PyUnicode_UTF8_LENGTH(self) = 0;
13262 _PyUnicode_UTF8(self) = NULL;
13263 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013264 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013265
13266 share_utf8 = 0;
13267 share_wstr = 0;
13268 if (kind == PyUnicode_1BYTE_KIND) {
13269 char_size = 1;
13270 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13271 share_utf8 = 1;
13272 }
13273 else if (kind == PyUnicode_2BYTE_KIND) {
13274 char_size = 2;
13275 if (sizeof(wchar_t) == 2)
13276 share_wstr = 1;
13277 }
13278 else {
13279 assert(kind == PyUnicode_4BYTE_KIND);
13280 char_size = 4;
13281 if (sizeof(wchar_t) == 4)
13282 share_wstr = 1;
13283 }
13284
13285 /* Ensure we won't overflow the length. */
13286 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13287 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013288 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013289 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013290 data = PyObject_MALLOC((length + 1) * char_size);
13291 if (data == NULL) {
13292 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013293 goto onError;
13294 }
13295
Victor Stinnerc3c74152011-10-02 20:39:55 +020013296 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013297 if (share_utf8) {
13298 _PyUnicode_UTF8_LENGTH(self) = length;
13299 _PyUnicode_UTF8(self) = data;
13300 }
13301 if (share_wstr) {
13302 _PyUnicode_WSTR_LENGTH(self) = length;
13303 _PyUnicode_WSTR(self) = (wchar_t *)data;
13304 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013305
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013306 Py_MEMCPY(data, PyUnicode_DATA(unicode),
13307 PyUnicode_KIND_SIZE(kind, length + 1));
13308 Py_DECREF(unicode);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013309 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013310#ifdef Py_DEBUG
13311 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13312#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013313 return (PyObject *)self;
13314
13315onError:
13316 Py_DECREF(unicode);
13317 Py_DECREF(self);
13318 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013319}
13320
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013321PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013322 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013323\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013324Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013325encoding defaults to the current default string encoding.\n\
13326errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013327
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013328static PyObject *unicode_iter(PyObject *seq);
13329
Guido van Rossumd57fd912000-03-10 22:53:23 +000013330PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013331 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013332 "str", /* tp_name */
13333 sizeof(PyUnicodeObject), /* tp_size */
13334 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013335 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013336 (destructor)unicode_dealloc, /* tp_dealloc */
13337 0, /* tp_print */
13338 0, /* tp_getattr */
13339 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013340 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013341 unicode_repr, /* tp_repr */
13342 &unicode_as_number, /* tp_as_number */
13343 &unicode_as_sequence, /* tp_as_sequence */
13344 &unicode_as_mapping, /* tp_as_mapping */
13345 (hashfunc) unicode_hash, /* tp_hash*/
13346 0, /* tp_call*/
13347 (reprfunc) unicode_str, /* tp_str */
13348 PyObject_GenericGetAttr, /* tp_getattro */
13349 0, /* tp_setattro */
13350 0, /* tp_as_buffer */
13351 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013352 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013353 unicode_doc, /* tp_doc */
13354 0, /* tp_traverse */
13355 0, /* tp_clear */
13356 PyUnicode_RichCompare, /* tp_richcompare */
13357 0, /* tp_weaklistoffset */
13358 unicode_iter, /* tp_iter */
13359 0, /* tp_iternext */
13360 unicode_methods, /* tp_methods */
13361 0, /* tp_members */
13362 0, /* tp_getset */
13363 &PyBaseObject_Type, /* tp_base */
13364 0, /* tp_dict */
13365 0, /* tp_descr_get */
13366 0, /* tp_descr_set */
13367 0, /* tp_dictoffset */
13368 0, /* tp_init */
13369 0, /* tp_alloc */
13370 unicode_new, /* tp_new */
13371 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013372};
13373
13374/* Initialize the Unicode implementation */
13375
Thomas Wouters78890102000-07-22 19:25:51 +000013376void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013377{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013378 int i;
13379
Thomas Wouters477c8d52006-05-27 19:21:47 +000013380 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013381 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013382 0x000A, /* LINE FEED */
13383 0x000D, /* CARRIAGE RETURN */
13384 0x001C, /* FILE SEPARATOR */
13385 0x001D, /* GROUP SEPARATOR */
13386 0x001E, /* RECORD SEPARATOR */
13387 0x0085, /* NEXT LINE */
13388 0x2028, /* LINE SEPARATOR */
13389 0x2029, /* PARAGRAPH SEPARATOR */
13390 };
13391
Fred Drakee4315f52000-05-09 19:53:39 +000013392 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013393 unicode_empty = PyUnicode_New(0, 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013394 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013395 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013396 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013397
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013398 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013399 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013400 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013401 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013402
13403 /* initialize the linebreak bloom filter */
13404 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013405 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013406 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013407
13408 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013409}
13410
13411/* Finalize the Unicode implementation */
13412
/* Clear the free list of unicode objects.
   This implementation does nothing and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
13418
Guido van Rossumd57fd912000-03-10 22:53:23 +000013419void
Thomas Wouters78890102000-07-22 19:25:51 +000013420_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013421{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013422 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013423
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013424 Py_XDECREF(unicode_empty);
13425 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013426
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013427 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013428 if (unicode_latin1[i]) {
13429 Py_DECREF(unicode_latin1[i]);
13430 unicode_latin1[i] = NULL;
13431 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013432 }
Christian Heimesa156e092008-02-16 07:38:31 +000013433 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013434}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013435
Walter Dörwald16807132007-05-25 13:52:07 +000013436void
13437PyUnicode_InternInPlace(PyObject **p)
13438{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013439 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
13440 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013441#ifdef Py_DEBUG
13442 assert(s != NULL);
13443 assert(_PyUnicode_CHECK(s));
13444#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013445 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013446 return;
13447#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013448 /* If it's a subclass, we don't really know what putting
13449 it in the interned dict might do. */
13450 if (!PyUnicode_CheckExact(s))
13451 return;
13452 if (PyUnicode_CHECK_INTERNED(s))
13453 return;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013454 if (_PyUnicode_READY_REPLACE(p)) {
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013455 assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013456 return;
13457 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013458 s = (PyUnicodeObject *)(*p);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013459 if (interned == NULL) {
13460 interned = PyDict_New();
13461 if (interned == NULL) {
13462 PyErr_Clear(); /* Don't leave an exception */
13463 return;
13464 }
13465 }
13466 /* It might be that the GetItem call fails even
13467 though the key is present in the dictionary,
13468 namely when this happens during a stack overflow. */
13469 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000013470 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013471 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013472
Benjamin Peterson29060642009-01-31 22:14:21 +000013473 if (t) {
13474 Py_INCREF(t);
13475 Py_DECREF(*p);
13476 *p = t;
13477 return;
13478 }
Walter Dörwald16807132007-05-25 13:52:07 +000013479
Benjamin Peterson14339b62009-01-31 16:36:08 +000013480 PyThreadState_GET()->recursion_critical = 1;
13481 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
13482 PyErr_Clear();
13483 PyThreadState_GET()->recursion_critical = 0;
13484 return;
13485 }
13486 PyThreadState_GET()->recursion_critical = 0;
13487 /* The two references in interned are not counted by refcnt.
13488 The deallocator will take care of this */
13489 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013490 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013491}
13492
13493void
13494PyUnicode_InternImmortal(PyObject **p)
13495{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013496 PyUnicodeObject *u = (PyUnicodeObject *)*p;
13497
Benjamin Peterson14339b62009-01-31 16:36:08 +000013498 PyUnicode_InternInPlace(p);
13499 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013500 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013501 Py_INCREF(*p);
13502 }
Walter Dörwald16807132007-05-25 13:52:07 +000013503}
13504
13505PyObject *
13506PyUnicode_InternFromString(const char *cp)
13507{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013508 PyObject *s = PyUnicode_FromString(cp);
13509 if (s == NULL)
13510 return NULL;
13511 PyUnicode_InternInPlace(&s);
13512 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013513}
13514
Alexander Belopolsky40018472011-02-26 01:02:56 +000013515void
13516_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013517{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013518 PyObject *keys;
13519 PyUnicodeObject *s;
13520 Py_ssize_t i, n;
13521 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013522
Benjamin Peterson14339b62009-01-31 16:36:08 +000013523 if (interned == NULL || !PyDict_Check(interned))
13524 return;
13525 keys = PyDict_Keys(interned);
13526 if (keys == NULL || !PyList_Check(keys)) {
13527 PyErr_Clear();
13528 return;
13529 }
Walter Dörwald16807132007-05-25 13:52:07 +000013530
Benjamin Peterson14339b62009-01-31 16:36:08 +000013531 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13532 detector, interned unicode strings are not forcibly deallocated;
13533 rather, we give them their stolen references back, and then clear
13534 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013535
Benjamin Peterson14339b62009-01-31 16:36:08 +000013536 n = PyList_GET_SIZE(keys);
13537 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013538 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013539 for (i = 0; i < n; i++) {
13540 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013541 if (PyUnicode_READY(s) == -1) {
13542 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013543 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013544 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013545 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013546 case SSTATE_NOT_INTERNED:
13547 /* XXX Shouldn't happen */
13548 break;
13549 case SSTATE_INTERNED_IMMORTAL:
13550 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013551 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013552 break;
13553 case SSTATE_INTERNED_MORTAL:
13554 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013555 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013556 break;
13557 default:
13558 Py_FatalError("Inconsistent interned string state.");
13559 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013560 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013561 }
13562 fprintf(stderr, "total size of all interned strings: "
13563 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13564 "mortal/immortal\n", mortal_size, immortal_size);
13565 Py_DECREF(keys);
13566 PyDict_Clear(interned);
13567 Py_DECREF(interned);
13568 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013569}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013570
13571
13572/********************* Unicode Iterator **************************/
13573
13574typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013575 PyObject_HEAD
13576 Py_ssize_t it_index;
13577 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013578} unicodeiterobject;
13579
13580static void
13581unicodeiter_dealloc(unicodeiterobject *it)
13582{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013583 _PyObject_GC_UNTRACK(it);
13584 Py_XDECREF(it->it_seq);
13585 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013586}
13587
13588static int
13589unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13590{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013591 Py_VISIT(it->it_seq);
13592 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013593}
13594
13595static PyObject *
13596unicodeiter_next(unicodeiterobject *it)
13597{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013598 PyUnicodeObject *seq;
13599 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013600
Benjamin Peterson14339b62009-01-31 16:36:08 +000013601 assert(it != NULL);
13602 seq = it->it_seq;
13603 if (seq == NULL)
13604 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013605 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013606
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013607 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13608 int kind = PyUnicode_KIND(seq);
13609 void *data = PyUnicode_DATA(seq);
13610 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13611 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013612 if (item != NULL)
13613 ++it->it_index;
13614 return item;
13615 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013616
Benjamin Peterson14339b62009-01-31 16:36:08 +000013617 Py_DECREF(seq);
13618 it->it_seq = NULL;
13619 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013620}
13621
13622static PyObject *
13623unicodeiter_len(unicodeiterobject *it)
13624{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013625 Py_ssize_t len = 0;
13626 if (it->it_seq)
13627 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
13628 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013629}
13630
13631PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
13632
13633static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013634 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000013635 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000013636 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013637};
13638
13639PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013640 PyVarObject_HEAD_INIT(&PyType_Type, 0)
13641 "str_iterator", /* tp_name */
13642 sizeof(unicodeiterobject), /* tp_basicsize */
13643 0, /* tp_itemsize */
13644 /* methods */
13645 (destructor)unicodeiter_dealloc, /* tp_dealloc */
13646 0, /* tp_print */
13647 0, /* tp_getattr */
13648 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013649 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013650 0, /* tp_repr */
13651 0, /* tp_as_number */
13652 0, /* tp_as_sequence */
13653 0, /* tp_as_mapping */
13654 0, /* tp_hash */
13655 0, /* tp_call */
13656 0, /* tp_str */
13657 PyObject_GenericGetAttr, /* tp_getattro */
13658 0, /* tp_setattro */
13659 0, /* tp_as_buffer */
13660 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
13661 0, /* tp_doc */
13662 (traverseproc)unicodeiter_traverse, /* tp_traverse */
13663 0, /* tp_clear */
13664 0, /* tp_richcompare */
13665 0, /* tp_weaklistoffset */
13666 PyObject_SelfIter, /* tp_iter */
13667 (iternextfunc)unicodeiter_next, /* tp_iternext */
13668 unicodeiter_methods, /* tp_methods */
13669 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013670};
13671
13672static PyObject *
13673unicode_iter(PyObject *seq)
13674{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013675 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013676
Benjamin Peterson14339b62009-01-31 16:36:08 +000013677 if (!PyUnicode_Check(seq)) {
13678 PyErr_BadInternalCall();
13679 return NULL;
13680 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013681 if (PyUnicode_READY(seq) == -1)
13682 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013683 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13684 if (it == NULL)
13685 return NULL;
13686 it->it_index = 0;
13687 Py_INCREF(seq);
13688 it->it_seq = (PyUnicodeObject *)seq;
13689 _PyObject_GC_TRACK(it);
13690 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013691}
13692
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013693#define UNIOP(x) Py_UNICODE_##x
13694#define UNIOP_t Py_UNICODE
13695#include "uniops.h"
13696#undef UNIOP
13697#undef UNIOP_t
13698#define UNIOP(x) Py_UCS4_##x
13699#define UNIOP_t Py_UCS4
13700#include "uniops.h"
13701#undef UNIOP
13702#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000013703
Victor Stinner71133ff2010-09-01 23:43:53 +000013704Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000013705PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000013706{
13707 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
13708 Py_UNICODE *copy;
13709 Py_ssize_t size;
13710
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013711 if (!PyUnicode_Check(unicode)) {
13712 PyErr_BadArgument();
13713 return NULL;
13714 }
Victor Stinner71133ff2010-09-01 23:43:53 +000013715 /* Ensure we won't overflow the size. */
13716 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13717 PyErr_NoMemory();
13718 return NULL;
13719 }
13720 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13721 size *= sizeof(Py_UNICODE);
13722 copy = PyMem_Malloc(size);
13723 if (copy == NULL) {
13724 PyErr_NoMemory();
13725 return NULL;
13726 }
13727 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
13728 return copy;
13729}
Martin v. Löwis5b222132007-06-10 09:51:05 +000013730
Georg Brandl66c221e2010-10-14 07:04:07 +000013731/* A _string module, to export formatter_parser and formatter_field_name_split
13732 to the string.Formatter class implemented in Python. */
13733
13734static PyMethodDef _string_methods[] = {
13735 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
13736 METH_O, PyDoc_STR("split the argument as a field name")},
13737 {"formatter_parser", (PyCFunction) formatter_parser,
13738 METH_O, PyDoc_STR("parse the argument as a format string")},
13739 {NULL, NULL}
13740};
13741
13742static struct PyModuleDef _string_module = {
13743 PyModuleDef_HEAD_INIT,
13744 "_string",
13745 PyDoc_STR("string helper module"),
13746 0,
13747 _string_methods,
13748 NULL,
13749 NULL,
13750 NULL,
13751 NULL
13752};
13753
13754PyMODINIT_FUNC
13755PyInit__string(void)
13756{
13757 return PyModule_Create(&_string_module);
13758}
13759
13760
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013761#ifdef __cplusplus
13762}
13763#endif