blob: 3ce845c3af3a84dafb5a70b1794a0edb68e5486f [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
/* Limit for the Unicode object free list: the maximum number of
   objects kept on it.
   NOTE(review): no free-list code is visible in this part of the file;
   confirm that this limit and KEEPALIVE_SIZE_LIMIT are still consumed. */

#define PyUnicode_MAXFREELIST 1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; defaults to little endian.
   Exactly one of BYTEORDER_IS_BIG_ENDIAN / BYTEORDER_IS_LITTLE_ENDIAN is
   defined, selected by whether WORDS_BIGENDIAN is defined. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
79
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000080/* --- Globals ------------------------------------------------------------
81
82 The globals are initialized by the _PyUnicode_Init() API and should
83 not be used before calling that API.
84
85*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000086
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000087
88#ifdef __cplusplus
89extern "C" {
90#endif
91
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names; begin and end
   are pointers to the source characters and should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written.

   The half-open range [begin, end) is copied element by element, each
   character being converted with a plain (to_type) cast; no range check
   is performed and nothing is written past the converted characters.

   Every macro argument is evaluated exactly once, so arguments that are
   expensive macro expansions or have side effects are safe to pass. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                            \
        const from_type *iter_ = (begin);                           \
        const from_type *end_ = (end);                              \
        to_type *to_ = (to_type *)(to);                             \
        for (; iter_ < end_; ++iter_, ++to_) {                      \
            *to_ = (to_type)*iter_;                                 \
        }                                                           \
    } while (0)
106
/* Raw accessors for the fields of a unicode object.  The macros whose
   names start with an underscore are plain lvalue field accesses and do
   no checking (except where an assert is visible below); `op` is cast to
   the structure that declares the field. */

/* Raw access to the utf8 pointer field of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* UTF-8 buffer of a ready unicode object: for compact ASCII objects this
   is the memory directly following the PyASCIIObject header, otherwise
   the utf8 field (as returned by _PyUnicode_UTF8). */
#define PyUnicode_UTF8(op)                              \
    (assert(PyUnicode_Check(op)),                       \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Raw access to the utf8_length field of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Length matching PyUnicode_UTF8(): the string length for compact ASCII
   objects, the utf8_length field otherwise. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(PyUnicode_Check(op)),                       \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Raw field accessors: wstr, length, state and hash live in
   PyASCIIObject; wstr_length lives in PyCompactUnicodeObject. */
#define _PyUnicode_WSTR(op) (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash)
/* state.kind of the object; only asserts PyUnicode_Check(), not
   readiness. */
#define _PyUnicode_KIND(op)                             \
    (assert(PyUnicode_Check(op)),                       \
     ((PyASCIIObject *)(op))->state.kind)
/* length field of the object; only asserts PyUnicode_Check(), not
   readiness. */
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(PyUnicode_Check(op)),                       \
     ((PyASCIIObject *)(op))->length)

/* The Unicode string has been modified: reset the hash to -1 (the value
   freshly created objects in this file are initialized with). */
#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
137
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200138
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned;

/* The empty Unicode object is shared to improve performance: the
   allocators in this file return it (with a new reference) for length 0
   once it is non-NULL. */
static PyUnicodeObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well; the array is indexed by the character value (0..255). */
static PyUnicodeObject *unicode_latin1[256];
155
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by an ASCII code (0..127); an entry is 1 when the
   character counts as whitespace and 0 otherwise.  The whitespace entries
   are:
     0x09 CHARACTER TABULATION    0x1C FILE SEPARATOR
     0x0A LINE FEED               0x1D GROUP SEPARATOR
     0x0B LINE TABULATION         0x1E RECORD SEPARATOR
     0x0C FORM FEED               0x1F UNIT SEPARATOR
     0x0D CARRIAGE RETURN         0x20 SPACE
   Each row below covers 16 consecutive codes. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
186
/* Forward declarations of two static encoder error helpers; their
   definitions are not in this part of the file. */

/* NOTE(review): presumably invokes the error handler named by `errors`
   for the range [startpos, endpos) of `unicode` and reports the position
   to resume at through *newpos -- confirm against the definition. */
static PyObject *
unicode_encode_call_errorhandler(const char *errors,
       PyObject **errorHandler,const char *encoding, const char *reason,
       const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);

/* NOTE(review): presumably raises an encode error for the range
   [startpos, endpos) of `unicode` -- confirm against the definition. */
static void
raise_encode_exception(PyObject **exceptionObject,
                       const char *encoding,
                       const Py_UNICODE *unicode, Py_ssize_t size,
                       Py_ssize_t startpos, Py_ssize_t endpos,
                       const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000199
/* Same for linebreaks.

   Read-only lookup table indexed by an ASCII code (0..127); an entry is 1
   when the character is a line break and 0 otherwise.  The line break
   entries are:
     0x0A LINE FEED               0x1C FILE SEPARATOR
     0x0B LINE TABULATION         0x1D GROUP SEPARATOR
     0x0C FORM FEED               0x1E RECORD SEPARATOR
     0x0D CARRIAGE RETURN
   Each row below covers 16 consecutive codes.  The table is declared
   const because it is only ever read (see BLOOM_LINEBREAK). */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
227
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300228/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
229 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000230Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000231PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000232{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000233#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000234 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000235#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000236 /* This is actually an illegal character, so it should
237 not be passed to unichr. */
238 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000239#endif
240}
241
Thomas Wouters477c8d52006-05-27 19:21:47 +0000242/* --- Bloom Filters ----------------------------------------------------- */
243
/* Support for simple "bloom filters" over Unicode characters.
   To keep things simple a single bitmask is used; the bit index of a
   character is its value masked with (BLOOM_WIDTH - 1), i.e. its lowest
   5, 6 or 7 bits for a BLOOM_WIDTH of 32, 64 or 128. */

/* the linebreak mask is set up by Unicode_Init below */

/* Select the largest supported filter width that does not exceed
   LONG_BIT; builds where LONG_BIT is below 32 are rejected. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif
259
/* Integer type holding one bloom filter bitmask. */
#define BLOOM_MASK unsigned long

/* Bloom mask used by BLOOM_LINEBREAK for characters >= 128. */
static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD(mask, ch): set the bit selected by ch in the lvalue mask.
   BLOOM(mask, ch): non-zero when the bit selected by ch is set in mask;
   zero means ch was definitely never added, non-zero means "maybe".

   The bit index is ch & (BLOOM_WIDTH - 1).  Both macro arguments are
   parenthesized so that compound expressions such as BLOOM(a | b, ch)
   expand with the intended precedence. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* True when ch is a line break: a table lookup for ch < 128, otherwise
   the bloom mask is consulted first so that Py_UNICODE_ISLINEBREAK() only
   runs for characters the filter cannot rule out.  Note that ch is
   evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000270
Alexander Belopolsky40018472011-02-26 01:02:56 +0000271Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200272make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000273{
274 /* calculate simple bloom-style bitmask for a given unicode string */
275
Antoine Pitrouf068f942010-01-13 14:19:12 +0000276 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000277 Py_ssize_t i;
278
279 mask = 0;
280 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200281 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000282
283 return mask;
284}
285
/* True when chr occurs in the unicode object str.  The bloom mask is
   checked first so that the PyUnicode_FindChar() search over the whole
   string only runs when the filter cannot rule the character out.
   chr and str are each evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, str) \
    (BLOOM(mask, chr) \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000289
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290/* --- Unicode Object ----------------------------------------------------- */
291
/* Forward declaration; the definition is not in this part of the file.
   NOTE(review): presumably applies fixfct to self and returns the
   resulting string object -- confirm against the definition. */
static PyObject *
fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
294
295Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
296 Py_ssize_t size, Py_UCS4 ch,
297 int direction)
298{
299 /* like wcschr, but doesn't stop at NULL characters */
300 Py_ssize_t i;
301 if (direction == 1) {
302 for(i = 0; i < size; i++)
303 if (PyUnicode_READ(kind, s, i) == ch)
304 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
305 }
306 else {
307 for(i = size-1; i >= 0; i--)
308 if (PyUnicode_READ(kind, s, i) == ch)
309 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
310 }
311 return NULL;
312}
313
Alexander Belopolsky40018472011-02-26 01:02:56 +0000314static int
315unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200316 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000317{
318 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000319
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200320 /* Resizing is only supported for old unicode objects. */
321 assert(!PyUnicode_IS_COMPACT(unicode));
322 assert(_PyUnicode_WSTR(unicode) != NULL);
323
324 /* ... and only if they have not been readied yet, because
325 callees usually rely on the wstr representation when resizing. */
326 assert(unicode->data.any == NULL);
327
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000328 /* Shortcut if there's nothing much to do. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200329 if (_PyUnicode_WSTR_LENGTH(unicode) == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000330 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000331
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000332 /* Resizing shared object (unicode_empty or single character
333 objects) in-place is not allowed. Use PyUnicode_Resize()
334 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000335
Benjamin Peterson14339b62009-01-31 16:36:08 +0000336 if (unicode == unicode_empty ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200337 (_PyUnicode_WSTR_LENGTH(unicode) == 1 &&
338 _PyUnicode_WSTR(unicode)[0] < 256U &&
339 unicode_latin1[_PyUnicode_WSTR(unicode)[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000340 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000341 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000342 return -1;
343 }
344
Thomas Wouters477c8d52006-05-27 19:21:47 +0000345 /* We allocate one more byte to make sure the string is Ux0000 terminated.
346 The overallocation is also used by fastsearch, which assumes that it's
347 safe to look at str[length] (without making any assumptions about what
348 it contains). */
349
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200350 oldstr = _PyUnicode_WSTR(unicode);
351 _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode),
352 sizeof(Py_UNICODE) * (length + 1));
353 if (!_PyUnicode_WSTR(unicode)) {
354 _PyUnicode_WSTR(unicode) = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000355 PyErr_NoMemory();
356 return -1;
357 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200358 _PyUnicode_WSTR(unicode)[length] = 0;
359 _PyUnicode_WSTR_LENGTH(unicode) = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360
Benjamin Peterson29060642009-01-31 22:14:21 +0000361 reset:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200362 if (unicode->data.any != NULL) {
363 PyObject_FREE(unicode->data.any);
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200364 if (_PyUnicode_UTF8(unicode) && _PyUnicode_UTF8(unicode) != unicode->data.any) {
365 PyObject_FREE(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200366 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200367 _PyUnicode_UTF8(unicode) = NULL;
368 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200369 unicode->data.any = NULL;
370 _PyUnicode_LENGTH(unicode) = 0;
371 _PyUnicode_STATE(unicode).interned = _PyUnicode_STATE(unicode).interned;
372 _PyUnicode_STATE(unicode).kind = PyUnicode_WCHAR_KIND;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373 }
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200374 _PyUnicode_DIRTY(unicode);
Tim Petersced69f82003-09-16 20:30:58 +0000375
Guido van Rossumd57fd912000-03-10 22:53:23 +0000376 return 0;
377}
378
/* We allocate one more character (one Py_UNICODE, not one byte) to make
   sure the string is Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/

/* Debug-build counter, incremented by _PyUnicode_New() for every object
   it actually allocates. */
#ifdef Py_DEBUG
int unicode_old_new_calls = 0;
#endif
391
Alexander Belopolsky40018472011-02-26 01:02:56 +0000392static PyUnicodeObject *
393_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000394{
395 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200396 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000397
Thomas Wouters477c8d52006-05-27 19:21:47 +0000398 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000399 if (length == 0 && unicode_empty != NULL) {
400 Py_INCREF(unicode_empty);
401 return unicode_empty;
402 }
403
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000404 /* Ensure we won't overflow the size. */
405 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
406 return (PyUnicodeObject *)PyErr_NoMemory();
407 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200408 if (length < 0) {
409 PyErr_SetString(PyExc_SystemError,
410 "Negative size passed to _PyUnicode_New");
411 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000412 }
413
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200414#ifdef Py_DEBUG
415 ++unicode_old_new_calls;
416#endif
417
418 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
419 if (unicode == NULL)
420 return NULL;
421 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
422 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
423 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000424 PyErr_NoMemory();
425 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000426 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200427
Jeremy Hyltond8082792003-09-16 19:41:39 +0000428 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000429 * the caller fails before initializing str -- unicode_resize()
430 * reads str[0], and the Keep-Alive optimization can keep memory
431 * allocated for str alive across a call to unicode_dealloc(unicode).
432 * We don't want unicode_resize to read uninitialized memory in
433 * that case.
434 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200435 _PyUnicode_WSTR(unicode)[0] = 0;
436 _PyUnicode_WSTR(unicode)[length] = 0;
437 _PyUnicode_WSTR_LENGTH(unicode) = length;
438 _PyUnicode_HASH(unicode) = -1;
439 _PyUnicode_STATE(unicode).interned = 0;
440 _PyUnicode_STATE(unicode).kind = 0;
441 _PyUnicode_STATE(unicode).compact = 0;
442 _PyUnicode_STATE(unicode).ready = 0;
443 _PyUnicode_STATE(unicode).ascii = 0;
444 unicode->data.any = NULL;
445 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200446 _PyUnicode_UTF8(unicode) = NULL;
447 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000448 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000449
Benjamin Peterson29060642009-01-31 22:14:21 +0000450 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000451 /* XXX UNREF/NEWREF interface should be more symmetrical */
452 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000453 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000454 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000455 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000456}
457
#ifdef Py_DEBUG
/* Debug-build counter, incremented by PyUnicode_New() for every object it
   actually allocates. */
int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Function form of the PyUnicode_UTF8() macro. */
char *_PyUnicode_utf8(void *unicode){
    return PyUnicode_UTF8(unicode);
}

/* Function form of the _PyUnicode_COMPACT_DATA() macro. */
void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}
/* Print the object address, its compact / compact-ASCII flags and the
   candidate data addresses to stdout, then return PyUnicode_DATA(). */
void *_PyUnicode_data(void *unicode){
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}
#endif
479
480PyObject *
481PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
482{
483 PyObject *obj;
484 PyCompactUnicodeObject *unicode;
485 void *data;
486 int kind_state;
487 int is_sharing = 0, is_ascii = 0;
488 Py_ssize_t char_size;
489 Py_ssize_t struct_size;
490
491 /* Optimization for empty strings */
492 if (size == 0 && unicode_empty != NULL) {
493 Py_INCREF(unicode_empty);
494 return (PyObject *)unicode_empty;
495 }
496
497#ifdef Py_DEBUG
498 ++unicode_new_new_calls;
499#endif
500
501 struct_size = sizeof(PyCompactUnicodeObject);
502 if (maxchar < 128) {
503 kind_state = PyUnicode_1BYTE_KIND;
504 char_size = 1;
505 is_ascii = 1;
506 struct_size = sizeof(PyASCIIObject);
507 }
508 else if (maxchar < 256) {
509 kind_state = PyUnicode_1BYTE_KIND;
510 char_size = 1;
511 }
512 else if (maxchar < 65536) {
513 kind_state = PyUnicode_2BYTE_KIND;
514 char_size = 2;
515 if (sizeof(wchar_t) == 2)
516 is_sharing = 1;
517 }
518 else {
519 kind_state = PyUnicode_4BYTE_KIND;
520 char_size = 4;
521 if (sizeof(wchar_t) == 4)
522 is_sharing = 1;
523 }
524
525 /* Ensure we won't overflow the size. */
526 if (size < 0) {
527 PyErr_SetString(PyExc_SystemError,
528 "Negative size passed to PyUnicode_New");
529 return NULL;
530 }
531 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
532 return PyErr_NoMemory();
533
534 /* Duplicated allocation code from _PyObject_New() instead of a call to
535 * PyObject_New() so we are able to allocate space for the object and
536 * it's data buffer.
537 */
538 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
539 if (obj == NULL)
540 return PyErr_NoMemory();
541 obj = PyObject_INIT(obj, &PyUnicode_Type);
542 if (obj == NULL)
543 return NULL;
544
545 unicode = (PyCompactUnicodeObject *)obj;
546 if (is_ascii)
547 data = ((PyASCIIObject*)obj) + 1;
548 else
549 data = unicode + 1;
550 _PyUnicode_LENGTH(unicode) = size;
551 _PyUnicode_HASH(unicode) = -1;
552 _PyUnicode_STATE(unicode).interned = 0;
553 _PyUnicode_STATE(unicode).kind = kind_state;
554 _PyUnicode_STATE(unicode).compact = 1;
555 _PyUnicode_STATE(unicode).ready = 1;
556 _PyUnicode_STATE(unicode).ascii = is_ascii;
557 if (is_ascii) {
558 ((char*)data)[size] = 0;
559 _PyUnicode_WSTR(unicode) = NULL;
560 }
561 else if (kind_state == PyUnicode_1BYTE_KIND) {
562 ((char*)data)[size] = 0;
563 _PyUnicode_WSTR(unicode) = NULL;
564 _PyUnicode_WSTR_LENGTH(unicode) = 0;
565 unicode->utf8_length = 0;
566 unicode->utf8 = NULL;
567 }
568 else {
569 unicode->utf8 = NULL;
570 if (kind_state == PyUnicode_2BYTE_KIND)
571 ((Py_UCS2*)data)[size] = 0;
572 else /* kind_state == PyUnicode_4BYTE_KIND */
573 ((Py_UCS4*)data)[size] = 0;
574 if (is_sharing) {
575 _PyUnicode_WSTR_LENGTH(unicode) = size;
576 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
577 }
578 else {
579 _PyUnicode_WSTR_LENGTH(unicode) = 0;
580 _PyUnicode_WSTR(unicode) = NULL;
581 }
582 }
583 return obj;
584}
585
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4.
   Valid surrogate pairs are combined into a single code point; any other
   code unit, including a lone surrogate, is copied through unchanged.
   The other conversions are implemented as macros for efficiency.

   The source is the range [begin, end) and the output is written to the
   4-byte data of `unicode`, which must already have the 4-byte kind.
   This function assumes that unicode can hold one more code point than
   wstr characters for a terminating null character.

   Always returns 0. */
static int
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dest;

    assert(unicode && PyUnicode_Check(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dest = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        assert(dest < (PyUnicode_4BYTE_DATA(unicode) +
                       _PyUnicode_GET_LENGTH(unicode)));
        /* High surrogate directly followed by a low surrogate? */
        is_pair = (src[0] >= 0xD800 && src[0] <= 0xDBFF
                   && (src + 1) < end
                   && src[1] >= 0xDC00 && src[1] <= 0xDFFF);
        if (!is_pair) {
            *dest++ = src[0];
            src += 1;
            continue;
        }
        *dest++ = (((src[0] & 0x3FF) << 10) | (src[1] & 0x3FF)) + 0x10000;
        src += 2;
    }
    assert(dest == (PyUnicode_4BYTE_DATA(unicode) +
                    _PyUnicode_GET_LENGTH(unicode)));

    return 0;
}
#endif
624
Victor Stinnercd9950f2011-10-02 00:34:53 +0200625static int
626_PyUnicode_Dirty(PyObject *unicode)
627{
628 assert(PyUnicode_Check(unicode));
629 if (Py_REFCNT(unicode) != 1) {
630 PyErr_SetString(PyExc_ValueError,
631 "Cannot modify a string having more than 1 reference");
632 return -1;
633 }
634 _PyUnicode_DIRTY(unicode);
635 return 0;
636}
637
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200638Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200639PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
640 PyObject *from, Py_ssize_t from_start,
641 Py_ssize_t how_many)
642{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200643 unsigned int from_kind, to_kind;
644 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200645
Victor Stinnerb1536152011-09-30 02:26:10 +0200646 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
647 PyErr_BadInternalCall();
648 return -1;
649 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200650
651 if (PyUnicode_READY(from))
652 return -1;
653 if (PyUnicode_READY(to))
654 return -1;
655
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200656 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200657 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
658 PyErr_Format(PyExc_ValueError,
659 "Cannot write %zi characters at %zi "
660 "in a string of %zi characters",
661 how_many, to_start, PyUnicode_GET_LENGTH(to));
662 return -1;
663 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200664 if (how_many == 0)
665 return 0;
666
Victor Stinnercd9950f2011-10-02 00:34:53 +0200667 if (_PyUnicode_Dirty(to))
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200668 return -1;
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200669
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200670 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200671 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200672 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200673 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200674
675 if (from_kind == to_kind) {
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200676 /* fast path */
Victor Stinnera0702ab2011-09-29 14:14:38 +0200677 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200678 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +0200679 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200680 + PyUnicode_KIND_SIZE(from_kind, from_start),
681 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200682 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200683 else if (from_kind == PyUnicode_1BYTE_KIND
684 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200685 {
686 _PyUnicode_CONVERT_BYTES(
687 Py_UCS1, Py_UCS2,
688 PyUnicode_1BYTE_DATA(from) + from_start,
689 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
690 PyUnicode_2BYTE_DATA(to) + to_start
691 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200692 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200693 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200694 && to_kind == PyUnicode_4BYTE_KIND)
695 {
696 _PyUnicode_CONVERT_BYTES(
697 Py_UCS1, Py_UCS4,
698 PyUnicode_1BYTE_DATA(from) + from_start,
699 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
700 PyUnicode_4BYTE_DATA(to) + to_start
701 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200702 }
703 else if (from_kind == PyUnicode_2BYTE_KIND
704 && to_kind == PyUnicode_4BYTE_KIND)
705 {
706 _PyUnicode_CONVERT_BYTES(
707 Py_UCS2, Py_UCS4,
708 PyUnicode_2BYTE_DATA(from) + from_start,
709 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
710 PyUnicode_4BYTE_DATA(to) + to_start
711 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200712 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200713 else {
714 int invalid_kinds;
715 if (from_kind > to_kind) {
716 /* slow path to check for character overflow */
717 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
718 Py_UCS4 ch, maxchar;
719 Py_ssize_t i;
720
721 maxchar = 0;
722 invalid_kinds = 0;
723 for (i=0; i < how_many; i++) {
724 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
725 if (ch > maxchar) {
726 maxchar = ch;
727 if (maxchar > to_maxchar) {
728 invalid_kinds = 1;
729 break;
730 }
731 }
732 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
733 }
734 }
735 else
736 invalid_kinds = 1;
737 if (invalid_kinds) {
738 PyErr_Format(PyExc_ValueError,
739 "Cannot copy UCS%u characters "
740 "into a string of UCS%u characters",
741 1 << (from_kind - 1),
742 1 << (to_kind -1));
743 return -1;
744 }
745 }
746 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200747}
748
Victor Stinner17222162011-09-28 22:15:37 +0200749/* Find the maximum code point and count the number of surrogate pairs so a
750 correct string length can be computed before converting a string to UCS4.
751 This function counts single surrogates as a character and not as a pair.
752
753 Return 0 on success, or -1 on error. */
754static int
755find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
756 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200757{
758 const wchar_t *iter;
759
760 if (num_surrogates == NULL || maxchar == NULL) {
761 PyErr_SetString(PyExc_SystemError,
762 "unexpected NULL arguments to "
763 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
764 return -1;
765 }
766
767 *num_surrogates = 0;
768 *maxchar = 0;
769
770 for (iter = begin; iter < end; ) {
771 if (*iter > *maxchar)
772 *maxchar = *iter;
773#if SIZEOF_WCHAR_T == 2
774 if (*iter >= 0xD800 && *iter <= 0xDBFF
775 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
776 {
777 Py_UCS4 surrogate_val;
778 surrogate_val = (((iter[0] & 0x3FF)<<10)
779 | (iter[1] & 0x3FF)) + 0x10000;
780 ++(*num_surrogates);
781 if (surrogate_val > *maxchar)
782 *maxchar = surrogate_val;
783 iter += 2;
784 }
785 else
786 iter++;
787#else
788 iter++;
789#endif
790 }
791 return 0;
792}
793
794#ifdef Py_DEBUG
795int unicode_ready_calls = 0;
796#endif
797
798int
Victor Stinnerd8f65102011-09-29 19:43:17 +0200799_PyUnicode_Ready(PyObject *obj)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200800{
Victor Stinnerd8f65102011-09-29 19:43:17 +0200801 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200802 wchar_t *end;
803 Py_UCS4 maxchar = 0;
804 Py_ssize_t num_surrogates;
805#if SIZEOF_WCHAR_T == 2
806 Py_ssize_t length_wo_surrogates;
807#endif
808
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200809 /* _PyUnicode_Ready() is only intented for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +0200810 strings were created using _PyObject_New() and where no canonical
811 representation (the str field) has been set yet aka strings
812 which are not yet ready. */
813 assert(PyUnicode_Check(obj));
814 assert(!PyUnicode_IS_READY(obj));
815 assert(!PyUnicode_IS_COMPACT(obj));
816 assert(_PyUnicode_KIND(obj) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200817 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +0200818 assert(unicode->data.any == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200819 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +0200820 /* Actually, it should neither be interned nor be anything else: */
821 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200822
823#ifdef Py_DEBUG
824 ++unicode_ready_calls;
825#endif
826
827 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +0200828 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +0200829 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200830 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200831
832 if (maxchar < 256) {
833 unicode->data.any = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
834 if (!unicode->data.any) {
835 PyErr_NoMemory();
836 return -1;
837 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200838 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200839 _PyUnicode_WSTR(unicode), end,
840 PyUnicode_1BYTE_DATA(unicode));
841 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
842 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
843 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
844 if (maxchar < 128) {
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200845 _PyUnicode_UTF8(unicode) = unicode->data.any;
846 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200847 }
848 else {
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200849 _PyUnicode_UTF8(unicode) = NULL;
850 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200851 }
852 PyObject_FREE(_PyUnicode_WSTR(unicode));
853 _PyUnicode_WSTR(unicode) = NULL;
854 _PyUnicode_WSTR_LENGTH(unicode) = 0;
855 }
856 /* In this case we might have to convert down from 4-byte native
857 wchar_t to 2-byte unicode. */
858 else if (maxchar < 65536) {
859 assert(num_surrogates == 0 &&
860 "FindMaxCharAndNumSurrogatePairs() messed up");
861
Victor Stinner506f5922011-09-28 22:34:18 +0200862#if SIZEOF_WCHAR_T == 2
863 /* We can share representations and are done. */
864 unicode->data.any = _PyUnicode_WSTR(unicode);
865 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
866 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
867 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200868 _PyUnicode_UTF8(unicode) = NULL;
869 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +0200870#else
871 /* sizeof(wchar_t) == 4 */
872 unicode->data.any = PyObject_MALLOC(
873 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
874 if (!unicode->data.any) {
875 PyErr_NoMemory();
876 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200877 }
Victor Stinner506f5922011-09-28 22:34:18 +0200878 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
879 _PyUnicode_WSTR(unicode), end,
880 PyUnicode_2BYTE_DATA(unicode));
881 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
882 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
883 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200884 _PyUnicode_UTF8(unicode) = NULL;
885 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +0200886 PyObject_FREE(_PyUnicode_WSTR(unicode));
887 _PyUnicode_WSTR(unicode) = NULL;
888 _PyUnicode_WSTR_LENGTH(unicode) = 0;
889#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200890 }
891 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
892 else {
893#if SIZEOF_WCHAR_T == 2
894 /* in case the native representation is 2-bytes, we need to allocate a
895 new normalized 4-byte version. */
896 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
897 unicode->data.any = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
898 if (!unicode->data.any) {
899 PyErr_NoMemory();
900 return -1;
901 }
902 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
903 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200904 _PyUnicode_UTF8(unicode) = NULL;
905 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200906 if (unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end,
907 unicode) < 0) {
908 assert(0 && "ConvertWideCharToUCS4 failed");
909 return -1;
910 }
911 PyObject_FREE(_PyUnicode_WSTR(unicode));
912 _PyUnicode_WSTR(unicode) = NULL;
913 _PyUnicode_WSTR_LENGTH(unicode) = 0;
914#else
915 assert(num_surrogates == 0);
916
917 unicode->data.any = _PyUnicode_WSTR(unicode);
918 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200919 _PyUnicode_UTF8(unicode) = NULL;
920 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200921 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
922#endif
923 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
924 }
925 _PyUnicode_STATE(unicode).ready = 1;
926 return 0;
927}
928
Alexander Belopolsky40018472011-02-26 01:02:56 +0000929static void
930unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000931{
Walter Dörwald16807132007-05-25 13:52:07 +0000932 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000933 case SSTATE_NOT_INTERNED:
934 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000935
Benjamin Peterson29060642009-01-31 22:14:21 +0000936 case SSTATE_INTERNED_MORTAL:
937 /* revive dead object temporarily for DelItem */
938 Py_REFCNT(unicode) = 3;
939 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
940 Py_FatalError(
941 "deletion of interned string failed");
942 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000943
Benjamin Peterson29060642009-01-31 22:14:21 +0000944 case SSTATE_INTERNED_IMMORTAL:
945 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000946
Benjamin Peterson29060642009-01-31 22:14:21 +0000947 default:
948 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000949 }
950
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200951 if (_PyUnicode_WSTR(unicode) &&
952 (!PyUnicode_IS_READY(unicode) ||
953 _PyUnicode_WSTR(unicode) != PyUnicode_DATA(unicode)))
954 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200955 if (!PyUnicode_IS_COMPACT_ASCII(unicode)
956 && _PyUnicode_UTF8(unicode)
957 && _PyUnicode_UTF8(unicode) != PyUnicode_DATA(unicode))
958 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200959
960 if (PyUnicode_IS_COMPACT(unicode)) {
961 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000962 }
963 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200964 if (unicode->data.any)
965 PyObject_DEL(unicode->data.any);
Benjamin Peterson29060642009-01-31 22:14:21 +0000966 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000967 }
968}
969
Alexander Belopolsky40018472011-02-26 01:02:56 +0000970static int
971_PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000972{
973 register PyUnicodeObject *v;
974
975 /* Argument checks */
976 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000977 PyErr_BadInternalCall();
978 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000979 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000980 v = *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200981 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0 ||
982 PyUnicode_IS_COMPACT(v) || _PyUnicode_WSTR(v) == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000983 PyErr_BadInternalCall();
984 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000985 }
986
987 /* Resizing unicode_empty and single character objects is not
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200988 possible since these are being shared.
989 The same goes for new-representation unicode objects or objects which
990 have already been readied.
991 For these, we simply return a fresh copy with the same Unicode content.
992 */
993 if ((_PyUnicode_WSTR_LENGTH(v) != length &&
994 (v == unicode_empty || _PyUnicode_WSTR_LENGTH(v) == 1)) ||
995 PyUnicode_IS_COMPACT(v) || v->data.any) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000996 PyUnicodeObject *w = _PyUnicode_New(length);
997 if (w == NULL)
998 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200999 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(v),
1000 length < _PyUnicode_WSTR_LENGTH(v) ? length : _PyUnicode_WSTR_LENGTH(v));
Benjamin Peterson29060642009-01-31 22:14:21 +00001001 Py_DECREF(*unicode);
1002 *unicode = w;
1003 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001004 }
1005
1006 /* Note that we don't have to modify *unicode for unshared Unicode
1007 objects, since we can modify them in-place. */
1008 return unicode_resize(v, length);
1009}
1010
Alexander Belopolsky40018472011-02-26 01:02:56 +00001011int
1012PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001013{
1014 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
1015}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001016
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001017static PyObject*
1018get_latin1_char(unsigned char ch)
1019{
1020 PyUnicodeObject *unicode = unicode_latin1[ch];
1021 if (!unicode) {
1022 unicode = (PyUnicodeObject *)PyUnicode_New(1, ch);
1023 if (!unicode)
1024 return NULL;
1025 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1026 unicode_latin1[ch] = unicode;
1027 }
1028 Py_INCREF(unicode);
1029 return (PyObject *)unicode;
1030}
1031
Alexander Belopolsky40018472011-02-26 01:02:56 +00001032PyObject *
1033PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001034{
1035 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001036 Py_UCS4 maxchar = 0;
1037 Py_ssize_t num_surrogates;
1038
1039 if (u == NULL)
1040 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001041
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001042 /* If the Unicode data is known at construction time, we can apply
1043 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001044
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001045 /* Optimization for empty strings */
1046 if (size == 0 && unicode_empty != NULL) {
1047 Py_INCREF(unicode_empty);
1048 return (PyObject *)unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001049 }
Tim Petersced69f82003-09-16 20:30:58 +00001050
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001051 /* Single character Unicode objects in the Latin-1 range are
1052 shared when using this constructor */
1053 if (size == 1 && *u < 256)
1054 return get_latin1_char((unsigned char)*u);
1055
1056 /* If not empty and not single character, copy the Unicode data
1057 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001058 if (find_maxchar_surrogates(u, u + size,
1059 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001060 return NULL;
1061
1062 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1063 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001064 if (!unicode)
1065 return NULL;
1066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001067 switch (PyUnicode_KIND(unicode)) {
1068 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001069 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001070 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1071 break;
1072 case PyUnicode_2BYTE_KIND:
1073#if Py_UNICODE_SIZE == 2
1074 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1075#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001076 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001077 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1078#endif
1079 break;
1080 case PyUnicode_4BYTE_KIND:
1081#if SIZEOF_WCHAR_T == 2
1082 /* This is the only case which has to process surrogates, thus
1083 a simple copy loop is not enough and we need a function. */
1084 if (unicode_convert_wchar_to_ucs4(u, u + size, unicode) < 0) {
1085 Py_DECREF(unicode);
1086 return NULL;
1087 }
1088#else
1089 assert(num_surrogates == 0);
1090 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1091#endif
1092 break;
1093 default:
1094 assert(0 && "Impossible state");
1095 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001096
1097 return (PyObject *)unicode;
1098}
1099
Alexander Belopolsky40018472011-02-26 01:02:56 +00001100PyObject *
1101PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001102{
1103 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001104
Benjamin Peterson14339b62009-01-31 16:36:08 +00001105 if (size < 0) {
1106 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001107 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001108 return NULL;
1109 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001110
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001111 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001112 some optimizations which share commonly used objects.
1113 Also, this means the input must be UTF-8, so fall back to the
1114 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001115 if (u != NULL) {
1116
Benjamin Peterson29060642009-01-31 22:14:21 +00001117 /* Optimization for empty strings */
1118 if (size == 0 && unicode_empty != NULL) {
1119 Py_INCREF(unicode_empty);
1120 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001121 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001122
1123 /* Single characters are shared when using this constructor.
1124 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001125 if (size == 1 && Py_CHARMASK(*u) < 128)
1126 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001127
1128 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001129 }
1130
Walter Dörwald55507312007-05-18 13:12:10 +00001131 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001132 if (!unicode)
1133 return NULL;
1134
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001135 return (PyObject *)unicode;
1136}
1137
Alexander Belopolsky40018472011-02-26 01:02:56 +00001138PyObject *
1139PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001140{
1141 size_t size = strlen(u);
1142 if (size > PY_SSIZE_T_MAX) {
1143 PyErr_SetString(PyExc_OverflowError, "input too long");
1144 return NULL;
1145 }
1146
1147 return PyUnicode_FromStringAndSize(u, size);
1148}
1149
Victor Stinnere57b1c02011-09-28 22:20:48 +02001150static PyObject*
1151_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001152{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001153 PyObject *res;
1154 unsigned char max = 127;
1155 Py_ssize_t i;
1156 for (i = 0; i < size; i++) {
1157 if (u[i] & 0x80) {
1158 max = 255;
1159 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001160 }
1161 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001162 res = PyUnicode_New(size, max);
1163 if (!res)
1164 return NULL;
1165 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1166 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001167}
1168
Victor Stinnere57b1c02011-09-28 22:20:48 +02001169static PyObject*
1170_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001171{
1172 PyObject *res;
1173 Py_UCS2 max = 0;
1174 Py_ssize_t i;
1175 for (i = 0; i < size; i++)
1176 if (u[i] > max)
1177 max = u[i];
1178 res = PyUnicode_New(size, max);
1179 if (!res)
1180 return NULL;
1181 if (max >= 256)
1182 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1183 else
1184 for (i = 0; i < size; i++)
1185 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1186 return res;
1187}
1188
Victor Stinnere57b1c02011-09-28 22:20:48 +02001189static PyObject*
1190_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001191{
1192 PyObject *res;
1193 Py_UCS4 max = 0;
1194 Py_ssize_t i;
1195 for (i = 0; i < size; i++)
1196 if (u[i] > max)
1197 max = u[i];
1198 res = PyUnicode_New(size, max);
1199 if (!res)
1200 return NULL;
1201 if (max >= 0x10000)
1202 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1203 else {
1204 int kind = PyUnicode_KIND(res);
1205 void *data = PyUnicode_DATA(res);
1206 for (i = 0; i < size; i++)
1207 PyUnicode_WRITE(kind, data, i, u[i]);
1208 }
1209 return res;
1210}
1211
1212PyObject*
1213PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1214{
1215 switch(kind) {
1216 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001217 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001218 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001219 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001220 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001221 return _PyUnicode_FromUCS4(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001222 }
Victor Stinner202b62b2011-10-01 23:48:37 +02001223 PyErr_SetString(PyExc_ValueError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001224 return NULL;
1225}
1226
Victor Stinner034f6cf2011-09-30 02:26:44 +02001227PyObject*
1228PyUnicode_Copy(PyObject *unicode)
1229{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001230 Py_ssize_t size;
1231 PyObject *copy;
1232 void *data;
1233
Victor Stinner034f6cf2011-09-30 02:26:44 +02001234 if (!PyUnicode_Check(unicode)) {
1235 PyErr_BadInternalCall();
1236 return NULL;
1237 }
1238 if (PyUnicode_READY(unicode))
1239 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001240
1241 size = PyUnicode_GET_LENGTH(unicode);
1242 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1243 if (!copy)
1244 return NULL;
1245 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1246
1247 data = PyUnicode_DATA(unicode);
1248 switch (PyUnicode_KIND(unicode))
1249 {
1250 case PyUnicode_1BYTE_KIND:
1251 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1252 break;
1253 case PyUnicode_2BYTE_KIND:
1254 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1255 break;
1256 case PyUnicode_4BYTE_KIND:
1257 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1258 break;
1259 default:
1260 assert(0);
1261 break;
1262 }
1263 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001264}
1265
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001266
1267/* Widen Unicode objects to larger buffers.
1268 Return NULL if the string is too wide already. */
1269
1270void*
1271_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1272{
1273 Py_ssize_t i;
1274 Py_ssize_t len = PyUnicode_GET_LENGTH(s);
1275 void *d = PyUnicode_DATA(s);
1276 unsigned int skind = PyUnicode_KIND(s);
1277 if (PyUnicode_KIND(s) >= kind) {
1278 PyErr_SetString(PyExc_RuntimeError, "invalid widening attempt");
1279 return NULL;
1280 }
1281 switch(kind) {
1282 case PyUnicode_2BYTE_KIND: {
1283 Py_UCS2 *result = PyMem_Malloc(PyUnicode_GET_LENGTH(s) * sizeof(Py_UCS2));
1284 if (!result) {
1285 PyErr_NoMemory();
1286 return 0;
1287 }
1288 for (i = 0; i < len; i++)
1289 result[i] = ((Py_UCS1*)d)[i];
1290 return result;
1291 }
1292 case PyUnicode_4BYTE_KIND: {
1293 Py_UCS4 *result = PyMem_Malloc(PyUnicode_GET_LENGTH(s) * sizeof(Py_UCS4));
1294 if (!result) {
1295 PyErr_NoMemory();
1296 return 0;
1297 }
1298 for (i = 0; i < len; i++)
1299 result[i] = PyUnicode_READ(skind, d, i);
1300 return result;
1301 }
1302 }
1303 Py_FatalError("invalid kind");
1304 return NULL;
1305}
1306
1307static Py_UCS4*
1308as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1309 int copy_null)
1310{
1311 int kind;
1312 void *data;
1313 Py_ssize_t len, targetlen;
1314 if (PyUnicode_READY(string) == -1)
1315 return NULL;
1316 kind = PyUnicode_KIND(string);
1317 data = PyUnicode_DATA(string);
1318 len = PyUnicode_GET_LENGTH(string);
1319 targetlen = len;
1320 if (copy_null)
1321 targetlen++;
1322 if (!target) {
1323 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1324 PyErr_NoMemory();
1325 return NULL;
1326 }
1327 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1328 if (!target) {
1329 PyErr_NoMemory();
1330 return NULL;
1331 }
1332 }
1333 else {
1334 if (targetsize < targetlen) {
1335 PyErr_Format(PyExc_SystemError,
1336 "string is longer than the buffer");
1337 if (copy_null && 0 < targetsize)
1338 target[0] = 0;
1339 return NULL;
1340 }
1341 }
1342 if (kind != PyUnicode_4BYTE_KIND) {
1343 Py_ssize_t i;
1344 for (i = 0; i < len; i++)
1345 target[i] = PyUnicode_READ(kind, data, i);
1346 }
1347 else
1348 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1349 if (copy_null)
1350 target[len] = 0;
1351 return target;
1352}
1353
1354Py_UCS4*
1355PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1356 int copy_null)
1357{
1358 if (target == NULL || targetsize < 1) {
1359 PyErr_BadInternalCall();
1360 return NULL;
1361 }
1362 return as_ucs4(string, target, targetsize, copy_null);
1363}
1364
1365Py_UCS4*
1366PyUnicode_AsUCS4Copy(PyObject *string)
1367{
1368 return as_ucs4(string, NULL, 0, 1);
1369}
1370
#ifdef HAVE_WCHAR_H

/* Create a string from the wchar_t buffer w of `size` items.

   A size of -1 means that w is null-terminated and is measured with
   wcslen().  A NULL buffer is only accepted together with size 0, in which
   case PyUnicode_New(0, 0) is returned; any other NULL request is a bad
   internal call.

   Return a new reference, or NULL on failure. */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        if (size == -1)
            size = wcslen(w);
        return PyUnicode_FromUnicode(w, size);
    }

    if (size == 0)
        return PyUnicode_New(0, 0);

    PyErr_BadInternalCall();
    return NULL;
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001391
Walter Dörwald346737f2007-05-31 10:44:43 +00001392static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001393makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1394 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001395{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001396 *fmt++ = '%';
1397 if (width) {
1398 if (zeropad)
1399 *fmt++ = '0';
1400 fmt += sprintf(fmt, "%d", width);
1401 }
1402 if (precision)
1403 fmt += sprintf(fmt, ".%d", precision);
1404 if (longflag)
1405 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001406 else if (longlongflag) {
1407 /* longlongflag should only ever be nonzero on machines with
1408 HAVE_LONG_LONG defined */
1409#ifdef HAVE_LONG_LONG
1410 char *f = PY_FORMAT_LONG_LONG;
1411 while (*f)
1412 *fmt++ = *f++;
1413#else
1414 /* we shouldn't ever get here */
1415 assert(0);
1416 *fmt++ = 'l';
1417#endif
1418 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001419 else if (size_tflag) {
1420 char *f = PY_FORMAT_SIZE_T;
1421 while (*f)
1422 *fmt++ = *f++;
1423 }
1424 *fmt++ = c;
1425 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001426}
1427
/* helper for PyUnicode_FromFormatV()

   f points at the '%' that starts a conversion.  The optional width,
   ".precision" and length modifier ("l", "ll" where available, or "z") are
   parsed, and each value is stored through its output pointer unless that
   pointer is NULL.  A length modifier is only recognised when it is
   directly followed by 'd', 'u' or 'i'.

   Return a pointer to the conversion character, i.e. the position reached
   after the parts above have been skipped. */
static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    /* Handle %ld, %lu, %lld and %llu, as well as the size_t flag. */
    switch (*f) {
    case 'l':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            size_tflag = 1;
            f += 1;
        }
        break;
    default:
        break;
    }

    /* Hand the results back; every output pointer is optional. */
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
1492
/* Upper bound on the characters produced by %ld: 21 covers a 64-bit
   integer written in decimal together with an optional sign. */
#define MAX_LONG_CHARS 21
/* Upper bound on the characters produced by %lld.
   At most ceil(log10(256)*SIZEOF_LONG_LONG) digits are needed, plus 1 for
   the sign; 53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1500
Walter Dörwaldd2034312007-05-18 16:29:38 +00001501PyObject *
1502PyUnicode_FromFormatV(const char *format, va_list vargs)
1503{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001504 va_list count;
1505 Py_ssize_t callcount = 0;
1506 PyObject **callresults = NULL;
1507 PyObject **callresult = NULL;
1508 Py_ssize_t n = 0;
1509 int width = 0;
1510 int precision = 0;
1511 int zeropad;
1512 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001513 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001514 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001515 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001516 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1517 Py_UCS4 argmaxchar;
1518 Py_ssize_t numbersize = 0;
1519 char *numberresults = NULL;
1520 char *numberresult = NULL;
1521 Py_ssize_t i;
1522 int kind;
1523 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001524
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001525 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001526 /* step 1: count the number of %S/%R/%A/%s format specifications
1527 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1528 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001529 * result in an array)
     * also estimate an upper bound for all the number formats in the string,
     * numbers will be formatted in step 3 and be kept in a '\0'-separated
     * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001533 for (f = format; *f; f++) {
1534 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001535 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001536 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1537 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1538 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1539 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001540
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001541 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001542#ifdef HAVE_LONG_LONG
1543 if (longlongflag) {
1544 if (width < MAX_LONG_LONG_CHARS)
1545 width = MAX_LONG_LONG_CHARS;
1546 }
1547 else
1548#endif
1549 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1550 including sign. Decimal takes the most space. This
1551 isn't enough for octal. If a width is specified we
1552 need more (which we allocate later). */
1553 if (width < MAX_LONG_CHARS)
1554 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001555
1556 /* account for the size + '\0' to separate numbers
1557 inside of the numberresults buffer */
1558 numbersize += (width + 1);
1559 }
1560 }
1561 else if ((unsigned char)*f > 127) {
1562 PyErr_Format(PyExc_ValueError,
1563 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1564 "string, got a non-ASCII byte: 0x%02x",
1565 (unsigned char)*f);
1566 return NULL;
1567 }
1568 }
1569 /* step 2: allocate memory for the results of
1570 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1571 if (callcount) {
1572 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1573 if (!callresults) {
1574 PyErr_NoMemory();
1575 return NULL;
1576 }
1577 callresult = callresults;
1578 }
1579 /* step 2.5: allocate memory for the results of formating numbers */
1580 if (numbersize) {
1581 numberresults = PyObject_Malloc(numbersize);
1582 if (!numberresults) {
1583 PyErr_NoMemory();
1584 goto fail;
1585 }
1586 numberresult = numberresults;
1587 }
1588
1589 /* step 3: format numbers and figure out how large a buffer we need */
1590 for (f = format; *f; f++) {
1591 if (*f == '%') {
1592 const char* p;
1593 int longflag;
1594 int longlongflag;
1595 int size_tflag;
1596 int numprinted;
1597
1598 p = f;
1599 zeropad = (f[1] == '0');
1600 f = parse_format_flags(f, &width, &precision,
1601 &longflag, &longlongflag, &size_tflag);
1602 switch (*f) {
1603 case 'c':
1604 {
1605 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001606 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001607 n++;
1608 break;
1609 }
1610 case '%':
1611 n++;
1612 break;
1613 case 'i':
1614 case 'd':
1615 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1616 width, precision, *f);
1617 if (longflag)
1618 numprinted = sprintf(numberresult, fmt,
1619 va_arg(count, long));
1620#ifdef HAVE_LONG_LONG
1621 else if (longlongflag)
1622 numprinted = sprintf(numberresult, fmt,
1623 va_arg(count, PY_LONG_LONG));
1624#endif
1625 else if (size_tflag)
1626 numprinted = sprintf(numberresult, fmt,
1627 va_arg(count, Py_ssize_t));
1628 else
1629 numprinted = sprintf(numberresult, fmt,
1630 va_arg(count, int));
1631 n += numprinted;
1632 /* advance by +1 to skip over the '\0' */
1633 numberresult += (numprinted + 1);
1634 assert(*(numberresult - 1) == '\0');
1635 assert(*(numberresult - 2) != '\0');
1636 assert(numprinted >= 0);
1637 assert(numberresult <= numberresults + numbersize);
1638 break;
1639 case 'u':
1640 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1641 width, precision, 'u');
1642 if (longflag)
1643 numprinted = sprintf(numberresult, fmt,
1644 va_arg(count, unsigned long));
1645#ifdef HAVE_LONG_LONG
1646 else if (longlongflag)
1647 numprinted = sprintf(numberresult, fmt,
1648 va_arg(count, unsigned PY_LONG_LONG));
1649#endif
1650 else if (size_tflag)
1651 numprinted = sprintf(numberresult, fmt,
1652 va_arg(count, size_t));
1653 else
1654 numprinted = sprintf(numberresult, fmt,
1655 va_arg(count, unsigned int));
1656 n += numprinted;
1657 numberresult += (numprinted + 1);
1658 assert(*(numberresult - 1) == '\0');
1659 assert(*(numberresult - 2) != '\0');
1660 assert(numprinted >= 0);
1661 assert(numberresult <= numberresults + numbersize);
1662 break;
1663 case 'x':
1664 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
1665 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
1666 n += numprinted;
1667 numberresult += (numprinted + 1);
1668 assert(*(numberresult - 1) == '\0');
1669 assert(*(numberresult - 2) != '\0');
1670 assert(numprinted >= 0);
1671 assert(numberresult <= numberresults + numbersize);
1672 break;
1673 case 'p':
1674 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
1675 /* %p is ill-defined: ensure leading 0x. */
1676 if (numberresult[1] == 'X')
1677 numberresult[1] = 'x';
1678 else if (numberresult[1] != 'x') {
1679 memmove(numberresult + 2, numberresult,
1680 strlen(numberresult) + 1);
1681 numberresult[0] = '0';
1682 numberresult[1] = 'x';
1683 numprinted += 2;
1684 }
1685 n += numprinted;
1686 numberresult += (numprinted + 1);
1687 assert(*(numberresult - 1) == '\0');
1688 assert(*(numberresult - 2) != '\0');
1689 assert(numprinted >= 0);
1690 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001691 break;
1692 case 's':
1693 {
1694 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00001695 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001696 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
1697 if (!str)
1698 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001699 /* since PyUnicode_DecodeUTF8 returns already flexible
1700 unicode objects, there is no need to call ready on them */
1701 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001702 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001703 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001704 /* Remember the str and switch to the next slot */
1705 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001706 break;
1707 }
1708 case 'U':
1709 {
1710 PyObject *obj = va_arg(count, PyObject *);
1711 assert(obj && PyUnicode_Check(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001712 if (PyUnicode_READY(obj) == -1)
1713 goto fail;
1714 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001715 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001716 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001717 break;
1718 }
1719 case 'V':
1720 {
1721 PyObject *obj = va_arg(count, PyObject *);
1722 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001723 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001724 assert(obj || str);
1725 assert(!obj || PyUnicode_Check(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00001726 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001727 if (PyUnicode_READY(obj) == -1)
1728 goto fail;
1729 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001730 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001731 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001732 *callresult++ = NULL;
1733 }
1734 else {
1735 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
1736 if (!str_obj)
1737 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001738 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001739 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001740 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001741 *callresult++ = str_obj;
1742 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001743 break;
1744 }
1745 case 'S':
1746 {
1747 PyObject *obj = va_arg(count, PyObject *);
1748 PyObject *str;
1749 assert(obj);
1750 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001751 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001752 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001753 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001754 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001755 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001756 /* Remember the str and switch to the next slot */
1757 *callresult++ = str;
1758 break;
1759 }
1760 case 'R':
1761 {
1762 PyObject *obj = va_arg(count, PyObject *);
1763 PyObject *repr;
1764 assert(obj);
1765 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001766 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001767 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001768 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001769 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001770 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001771 /* Remember the repr and switch to the next slot */
1772 *callresult++ = repr;
1773 break;
1774 }
1775 case 'A':
1776 {
1777 PyObject *obj = va_arg(count, PyObject *);
1778 PyObject *ascii;
1779 assert(obj);
1780 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001781 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001782 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001783 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001784 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001785 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001786 /* Remember the repr and switch to the next slot */
1787 *callresult++ = ascii;
1788 break;
1789 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001790 default:
1791 /* if we stumble upon an unknown
1792 formatting code, copy the rest of
1793 the format string to the output
1794 string. (we cannot just skip the
1795 code, since there's no way to know
1796 what's in the argument list) */
1797 n += strlen(p);
1798 goto expand;
1799 }
1800 } else
1801 n++;
1802 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001803 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001804 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001805 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00001806 we don't have to resize the string.
1807 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001808 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001809 if (!string)
1810 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001811 kind = PyUnicode_KIND(string);
1812 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001813 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001814 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001815
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001816 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001817 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001818 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00001819
1820 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001821 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
1822 /* checking for == because the last argument could be a empty
1823 string, which causes i to point to end, the assert at the end of
1824 the loop */
1825 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00001826
Benjamin Peterson14339b62009-01-31 16:36:08 +00001827 switch (*f) {
1828 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001829 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001830 const int ordinal = va_arg(vargs, int);
1831 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001832 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001833 }
Victor Stinner6d970f42011-03-02 00:04:25 +00001834 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001835 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001836 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001837 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001838 case 'p':
1839 /* unused, since we already have the result */
1840 if (*f == 'p')
1841 (void) va_arg(vargs, void *);
1842 else
1843 (void) va_arg(vargs, int);
1844 /* extract the result from numberresults and append. */
1845 for (; *numberresult; ++i, ++numberresult)
1846 PyUnicode_WRITE(kind, data, i, *numberresult);
1847 /* skip over the separating '\0' */
1848 assert(*numberresult == '\0');
1849 numberresult++;
1850 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001851 break;
1852 case 's':
1853 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001854 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001855 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001856 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001857 size = PyUnicode_GET_LENGTH(*callresult);
1858 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001859 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1860 *callresult, 0,
1861 size) < 0)
1862 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001863 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001864 /* We're done with the unicode()/repr() => forget it */
1865 Py_DECREF(*callresult);
1866 /* switch to next unicode()/repr() result */
1867 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001868 break;
1869 }
1870 case 'U':
1871 {
1872 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001873 Py_ssize_t size;
1874 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
1875 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001876 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1877 obj, 0,
1878 size) < 0)
1879 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001880 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001881 break;
1882 }
1883 case 'V':
1884 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001885 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001886 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001887 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001888 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001889 size = PyUnicode_GET_LENGTH(obj);
1890 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001891 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1892 obj, 0,
1893 size) < 0)
1894 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001895 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001896 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001897 size = PyUnicode_GET_LENGTH(*callresult);
1898 assert(PyUnicode_KIND(*callresult) <=
1899 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001900 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1901 *callresult,
1902 0, size) < 0)
1903 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001904 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00001905 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001906 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00001907 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001908 break;
1909 }
1910 case 'S':
1911 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001912 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001913 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001914 /* unused, since we already have the result */
1915 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001916 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001917 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1918 *callresult, 0,
1919 PyUnicode_GET_LENGTH(*callresult)) < 0)
1920 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001921 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001922 /* We're done with the unicode()/repr() => forget it */
1923 Py_DECREF(*callresult);
1924 /* switch to next unicode()/repr() result */
1925 ++callresult;
1926 break;
1927 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001928 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001929 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001930 break;
1931 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001932 for (; *p; ++p, ++i)
1933 PyUnicode_WRITE(kind, data, i, *p);
1934 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00001935 goto end;
1936 }
Victor Stinner1205f272010-09-11 00:54:47 +00001937 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001938 else {
1939 assert(i < PyUnicode_GET_LENGTH(string));
1940 PyUnicode_WRITE(kind, data, i++, *f);
1941 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001942 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001943 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00001944
Benjamin Peterson29060642009-01-31 22:14:21 +00001945 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001946 if (callresults)
1947 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001948 if (numberresults)
1949 PyObject_Free(numberresults);
1950 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001951 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001952 if (callresults) {
1953 PyObject **callresult2 = callresults;
1954 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00001955 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001956 ++callresult2;
1957 }
1958 PyObject_Free(callresults);
1959 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001960 if (numberresults)
1961 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001962 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001963}
1964
Walter Dörwaldd2034312007-05-18 16:29:38 +00001965PyObject *
1966PyUnicode_FromFormat(const char *format, ...)
1967{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001968 PyObject* ret;
1969 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001970
1971#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001972 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001973#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001974 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001975#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001976 ret = PyUnicode_FromFormatV(format, vargs);
1977 va_end(vargs);
1978 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001979}
1980
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001981#ifdef HAVE_WCHAR_H
1982
Victor Stinner5593d8a2010-10-02 11:11:27 +00001983/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1984 convert a Unicode object to a wide character string.
1985
Victor Stinnerd88d9832011-09-06 02:00:05 +02001986 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001987 character) required to convert the unicode object. Ignore size argument.
1988
Victor Stinnerd88d9832011-09-06 02:00:05 +02001989 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001990 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02001991 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00001992static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001993unicode_aswidechar(PyUnicodeObject *unicode,
1994 wchar_t *w,
1995 Py_ssize_t size)
1996{
Victor Stinner5593d8a2010-10-02 11:11:27 +00001997 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001998 const wchar_t *wstr;
1999
2000 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2001 if (wstr == NULL)
2002 return -1;
2003
Victor Stinner5593d8a2010-10-02 11:11:27 +00002004 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002005 if (size > res)
2006 size = res + 1;
2007 else
2008 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002009 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002010 return res;
2011 }
2012 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002013 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002014}
2015
2016Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002017PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002018 wchar_t *w,
2019 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002020{
2021 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002022 PyErr_BadInternalCall();
2023 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002024 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002025 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002026}
2027
Victor Stinner137c34c2010-09-29 10:25:54 +00002028wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002029PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002030 Py_ssize_t *size)
2031{
2032 wchar_t* buffer;
2033 Py_ssize_t buflen;
2034
2035 if (unicode == NULL) {
2036 PyErr_BadInternalCall();
2037 return NULL;
2038 }
2039
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002040 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002041 if (buflen == -1)
2042 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002043 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002044 PyErr_NoMemory();
2045 return NULL;
2046 }
2047
Victor Stinner137c34c2010-09-29 10:25:54 +00002048 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2049 if (buffer == NULL) {
2050 PyErr_NoMemory();
2051 return NULL;
2052 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002053 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002054 if (buflen == -1)
2055 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002056 if (size != NULL)
2057 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002058 return buffer;
2059}
2060
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002061#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002062
Alexander Belopolsky40018472011-02-26 01:02:56 +00002063PyObject *
2064PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002065{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002066 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002067 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002068 PyErr_SetString(PyExc_ValueError,
2069 "chr() arg not in range(0x110000)");
2070 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002071 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002072
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002073 if (ordinal < 256)
2074 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002075
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002076 v = PyUnicode_New(1, ordinal);
2077 if (v == NULL)
2078 return NULL;
2079 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2080 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002081}
2082
Alexander Belopolsky40018472011-02-26 01:02:56 +00002083PyObject *
2084PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002085{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002086 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002087 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002088 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002089 if (PyUnicode_READY(obj))
2090 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002091 Py_INCREF(obj);
2092 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002093 }
2094 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002095 /* For a Unicode subtype that's not a Unicode object,
2096 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002097 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002098 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002099 PyErr_Format(PyExc_TypeError,
2100 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002101 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002102 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002103}
2104
Alexander Belopolsky40018472011-02-26 01:02:56 +00002105PyObject *
2106PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002107 const char *encoding,
2108 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002109{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002110 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002111 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002112
Guido van Rossumd57fd912000-03-10 22:53:23 +00002113 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002114 PyErr_BadInternalCall();
2115 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002116 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002117
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002118 /* Decoding bytes objects is the most common case and should be fast */
2119 if (PyBytes_Check(obj)) {
2120 if (PyBytes_GET_SIZE(obj) == 0) {
2121 Py_INCREF(unicode_empty);
2122 v = (PyObject *) unicode_empty;
2123 }
2124 else {
2125 v = PyUnicode_Decode(
2126 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2127 encoding, errors);
2128 }
2129 return v;
2130 }
2131
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002132 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002133 PyErr_SetString(PyExc_TypeError,
2134 "decoding str is not supported");
2135 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002136 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002137
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002138 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2139 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2140 PyErr_Format(PyExc_TypeError,
2141 "coercing to str: need bytes, bytearray "
2142 "or buffer-like object, %.80s found",
2143 Py_TYPE(obj)->tp_name);
2144 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002145 }
Tim Petersced69f82003-09-16 20:30:58 +00002146
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002147 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002148 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002149 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002150 }
Tim Petersced69f82003-09-16 20:30:58 +00002151 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002152 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002153
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002154 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002155 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002156}
2157
/* Write a normalized copy of `encoding` into `lower`: upper-case letters are
   lower-cased and every '_' becomes '-', so that e.g. "UTF_8" is recognized
   as "utf-8".  `lower_len` is the capacity of `lower` including the
   terminating null byte.

   Return 1 on success, 0 if `encoding` does not fit (it is longer than
   lower_len-1); in that case `lower` is left unterminated. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* last usable position: reserved for the terminating null byte */
    char *const limit = &lower[lower_len - 1];

    for (src = encoding; *src; src++) {
        char ch;

        if (dst == limit)
            return 0;

        if (Py_ISUPPER(*src))
            ch = Py_TOLOWER(*src);
        else if (*src == '_')
            ch = '-';
        else
            ch = *src;
        *dst++ = ch;
    }
    *dst = '\0';
    return 1;
}
2190
Alexander Belopolsky40018472011-02-26 01:02:56 +00002191PyObject *
2192PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002193 Py_ssize_t size,
2194 const char *encoding,
2195 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002196{
2197 PyObject *buffer = NULL, *unicode;
2198 Py_buffer info;
2199 char lower[11]; /* Enough for any encoding shortcut */
2200
2201 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002202 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002203
2204 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002205 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002206 if ((strcmp(lower, "utf-8") == 0) ||
2207 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002208 return PyUnicode_DecodeUTF8(s, size, errors);
2209 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002210 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002211 (strcmp(lower, "iso-8859-1") == 0))
2212 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002213#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002214 else if (strcmp(lower, "mbcs") == 0)
2215 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002216#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002217 else if (strcmp(lower, "ascii") == 0)
2218 return PyUnicode_DecodeASCII(s, size, errors);
2219 else if (strcmp(lower, "utf-16") == 0)
2220 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2221 else if (strcmp(lower, "utf-32") == 0)
2222 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2223 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002224
2225 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002226 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002227 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002228 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002229 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002230 if (buffer == NULL)
2231 goto onError;
2232 unicode = PyCodec_Decode(buffer, encoding, errors);
2233 if (unicode == NULL)
2234 goto onError;
2235 if (!PyUnicode_Check(unicode)) {
2236 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002237 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002238 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002239 Py_DECREF(unicode);
2240 goto onError;
2241 }
2242 Py_DECREF(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002243 if (PyUnicode_READY(unicode)) {
2244 Py_DECREF(unicode);
2245 return NULL;
2246 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002247 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002248
Benjamin Peterson29060642009-01-31 22:14:21 +00002249 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002250 Py_XDECREF(buffer);
2251 return NULL;
2252}
2253
Alexander Belopolsky40018472011-02-26 01:02:56 +00002254PyObject *
2255PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002256 const char *encoding,
2257 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002258{
2259 PyObject *v;
2260
2261 if (!PyUnicode_Check(unicode)) {
2262 PyErr_BadArgument();
2263 goto onError;
2264 }
2265
2266 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002267 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002268
2269 /* Decode via the codec registry */
2270 v = PyCodec_Decode(unicode, encoding, errors);
2271 if (v == NULL)
2272 goto onError;
2273 return v;
2274
Benjamin Peterson29060642009-01-31 22:14:21 +00002275 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002276 return NULL;
2277}
2278
Alexander Belopolsky40018472011-02-26 01:02:56 +00002279PyObject *
2280PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002281 const char *encoding,
2282 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002283{
2284 PyObject *v;
2285
2286 if (!PyUnicode_Check(unicode)) {
2287 PyErr_BadArgument();
2288 goto onError;
2289 }
2290
2291 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002292 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002293
2294 /* Decode via the codec registry */
2295 v = PyCodec_Decode(unicode, encoding, errors);
2296 if (v == NULL)
2297 goto onError;
2298 if (!PyUnicode_Check(v)) {
2299 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002300 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002301 Py_TYPE(v)->tp_name);
2302 Py_DECREF(v);
2303 goto onError;
2304 }
2305 return v;
2306
Benjamin Peterson29060642009-01-31 22:14:21 +00002307 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002308 return NULL;
2309}
2310
Alexander Belopolsky40018472011-02-26 01:02:56 +00002311PyObject *
2312PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002313 Py_ssize_t size,
2314 const char *encoding,
2315 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002316{
2317 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002318
Guido van Rossumd57fd912000-03-10 22:53:23 +00002319 unicode = PyUnicode_FromUnicode(s, size);
2320 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002321 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002322 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2323 Py_DECREF(unicode);
2324 return v;
2325}
2326
Alexander Belopolsky40018472011-02-26 01:02:56 +00002327PyObject *
2328PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002329 const char *encoding,
2330 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002331{
2332 PyObject *v;
2333
2334 if (!PyUnicode_Check(unicode)) {
2335 PyErr_BadArgument();
2336 goto onError;
2337 }
2338
2339 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002340 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002341
2342 /* Encode via the codec registry */
2343 v = PyCodec_Encode(unicode, encoding, errors);
2344 if (v == NULL)
2345 goto onError;
2346 return v;
2347
Benjamin Peterson29060642009-01-31 22:14:21 +00002348 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002349 return NULL;
2350}
2351
Victor Stinnerad158722010-10-27 00:25:46 +00002352PyObject *
2353PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002354{
Victor Stinner99b95382011-07-04 14:23:54 +02002355#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002356 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2357 PyUnicode_GET_SIZE(unicode),
2358 NULL);
2359#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002360 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002361#else
Victor Stinner793b5312011-04-27 00:24:21 +02002362 PyInterpreterState *interp = PyThreadState_GET()->interp;
2363 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2364 cannot use it to encode and decode filenames before it is loaded. Load
2365 the Python codec requires to encode at least its own filename. Use the C
2366 version of the locale codec until the codec registry is initialized and
2367 the Python codec is loaded.
2368
2369 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2370 cannot only rely on it: check also interp->fscodec_initialized for
2371 subinterpreters. */
2372 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002373 return PyUnicode_AsEncodedString(unicode,
2374 Py_FileSystemDefaultEncoding,
2375 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002376 }
2377 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002378 /* locale encoding with surrogateescape */
2379 wchar_t *wchar;
2380 char *bytes;
2381 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002382 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002383
2384 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2385 if (wchar == NULL)
2386 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002387 bytes = _Py_wchar2char(wchar, &error_pos);
2388 if (bytes == NULL) {
2389 if (error_pos != (size_t)-1) {
2390 char *errmsg = strerror(errno);
2391 PyObject *exc = NULL;
2392 if (errmsg == NULL)
2393 errmsg = "Py_wchar2char() failed";
2394 raise_encode_exception(&exc,
2395 "filesystemencoding",
2396 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2397 error_pos, error_pos+1,
2398 errmsg);
2399 Py_XDECREF(exc);
2400 }
2401 else
2402 PyErr_NoMemory();
2403 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002404 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002405 }
2406 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002407
2408 bytes_obj = PyBytes_FromString(bytes);
2409 PyMem_Free(bytes);
2410 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002411 }
Victor Stinnerad158722010-10-27 00:25:46 +00002412#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002413}
2414
Alexander Belopolsky40018472011-02-26 01:02:56 +00002415PyObject *
2416PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002417 const char *encoding,
2418 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002419{
2420 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002421 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002422
Guido van Rossumd57fd912000-03-10 22:53:23 +00002423 if (!PyUnicode_Check(unicode)) {
2424 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002425 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002426 }
Fred Drakee4315f52000-05-09 19:53:39 +00002427
Victor Stinner2f283c22011-03-02 01:21:46 +00002428 if (encoding == NULL) {
2429 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002430 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002431 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002432 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002433 }
Fred Drakee4315f52000-05-09 19:53:39 +00002434
2435 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002436 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002437 if ((strcmp(lower, "utf-8") == 0) ||
2438 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002439 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002440 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002441 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002442 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002443 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002444 }
Victor Stinner37296e82010-06-10 13:36:23 +00002445 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002446 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002447 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002448 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002449#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002450 else if (strcmp(lower, "mbcs") == 0)
2451 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2452 PyUnicode_GET_SIZE(unicode),
2453 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002454#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002455 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002456 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002457 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002458
2459 /* Encode via the codec registry */
2460 v = PyCodec_Encode(unicode, encoding, errors);
2461 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002462 return NULL;
2463
2464 /* The normal path */
2465 if (PyBytes_Check(v))
2466 return v;
2467
2468 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002469 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002470 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002471 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002472
2473 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2474 "encoder %s returned bytearray instead of bytes",
2475 encoding);
2476 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002477 Py_DECREF(v);
2478 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002479 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002480
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002481 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2482 Py_DECREF(v);
2483 return b;
2484 }
2485
2486 PyErr_Format(PyExc_TypeError,
2487 "encoder did not return a bytes object (type=%.400s)",
2488 Py_TYPE(v)->tp_name);
2489 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002490 return NULL;
2491}
2492
Alexander Belopolsky40018472011-02-26 01:02:56 +00002493PyObject *
2494PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002495 const char *encoding,
2496 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002497{
2498 PyObject *v;
2499
2500 if (!PyUnicode_Check(unicode)) {
2501 PyErr_BadArgument();
2502 goto onError;
2503 }
2504
2505 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002506 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002507
2508 /* Encode via the codec registry */
2509 v = PyCodec_Encode(unicode, encoding, errors);
2510 if (v == NULL)
2511 goto onError;
2512 if (!PyUnicode_Check(v)) {
2513 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002514 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002515 Py_TYPE(v)->tp_name);
2516 Py_DECREF(v);
2517 goto onError;
2518 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002519 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002520
Benjamin Peterson29060642009-01-31 22:14:21 +00002521 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002522 return NULL;
2523}
2524
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002525PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002526PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002527 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002528 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2529}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002530
Christian Heimes5894ba72007-11-04 11:43:14 +00002531PyObject*
2532PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2533{
Victor Stinner99b95382011-07-04 14:23:54 +02002534#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002535 return PyUnicode_DecodeMBCS(s, size, NULL);
2536#elif defined(__APPLE__)
2537 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2538#else
Victor Stinner793b5312011-04-27 00:24:21 +02002539 PyInterpreterState *interp = PyThreadState_GET()->interp;
2540 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2541 cannot use it to encode and decode filenames before it is loaded. Load
2542 the Python codec requires to encode at least its own filename. Use the C
2543 version of the locale codec until the codec registry is initialized and
2544 the Python codec is loaded.
2545
2546 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2547 cannot only rely on it: check also interp->fscodec_initialized for
2548 subinterpreters. */
2549 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002550 return PyUnicode_Decode(s, size,
2551 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002552 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002553 }
2554 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002555 /* locale encoding with surrogateescape */
2556 wchar_t *wchar;
2557 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002558 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002559
2560 if (s[size] != '\0' || size != strlen(s)) {
2561 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2562 return NULL;
2563 }
2564
Victor Stinner168e1172010-10-16 23:16:16 +00002565 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002566 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002567 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002568
Victor Stinner168e1172010-10-16 23:16:16 +00002569 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002570 PyMem_Free(wchar);
2571 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002572 }
Victor Stinnerad158722010-10-27 00:25:46 +00002573#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002574}
2575
Martin v. Löwis011e8422009-05-05 04:43:17 +00002576
2577int
2578PyUnicode_FSConverter(PyObject* arg, void* addr)
2579{
2580 PyObject *output = NULL;
2581 Py_ssize_t size;
2582 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002583 if (arg == NULL) {
2584 Py_DECREF(*(PyObject**)addr);
2585 return 1;
2586 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00002587 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00002588 output = arg;
2589 Py_INCREF(output);
2590 }
2591 else {
2592 arg = PyUnicode_FromObject(arg);
2593 if (!arg)
2594 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00002595 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002596 Py_DECREF(arg);
2597 if (!output)
2598 return 0;
2599 if (!PyBytes_Check(output)) {
2600 Py_DECREF(output);
2601 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2602 return 0;
2603 }
2604 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00002605 size = PyBytes_GET_SIZE(output);
2606 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002607 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05002608 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00002609 Py_DECREF(output);
2610 return 0;
2611 }
2612 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002613 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00002614}
2615
2616
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002617int
2618PyUnicode_FSDecoder(PyObject* arg, void* addr)
2619{
2620 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002621 if (arg == NULL) {
2622 Py_DECREF(*(PyObject**)addr);
2623 return 1;
2624 }
2625 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002626 if (PyUnicode_READY(arg))
2627 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002628 output = arg;
2629 Py_INCREF(output);
2630 }
2631 else {
2632 arg = PyBytes_FromObject(arg);
2633 if (!arg)
2634 return 0;
2635 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
2636 PyBytes_GET_SIZE(arg));
2637 Py_DECREF(arg);
2638 if (!output)
2639 return 0;
2640 if (!PyUnicode_Check(output)) {
2641 Py_DECREF(output);
2642 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
2643 return 0;
2644 }
2645 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002646 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
2647 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002648 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2649 Py_DECREF(output);
2650 return 0;
2651 }
2652 *(PyObject**)addr = output;
2653 return Py_CLEANUP_SUPPORTED;
2654}
2655
2656
Martin v. Löwis5b222132007-06-10 09:51:05 +00002657char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002658PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002659{
Christian Heimesf3863112007-11-22 07:46:41 +00002660 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002661 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
2662
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00002663 if (!PyUnicode_Check(unicode)) {
2664 PyErr_BadArgument();
2665 return NULL;
2666 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002667 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002668 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002669
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002670 if (PyUnicode_UTF8(unicode) == NULL) {
2671 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002672 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
2673 if (bytes == NULL)
2674 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002675 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
2676 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002677 Py_DECREF(bytes);
2678 return NULL;
2679 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002680 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
2681 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002682 Py_DECREF(bytes);
2683 }
2684
2685 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002686 *psize = PyUnicode_UTF8_LENGTH(unicode);
2687 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002688}
2689
2690char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002691PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002692{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002693 return PyUnicode_AsUTF8AndSize(unicode, NULL);
2694}
2695
2696#ifdef Py_DEBUG
2697int unicode_as_unicode_calls = 0;
2698#endif
2699
2700
2701Py_UNICODE *
2702PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
2703{
2704 PyUnicodeObject *u;
2705 const unsigned char *one_byte;
2706#if SIZEOF_WCHAR_T == 4
2707 const Py_UCS2 *two_bytes;
2708#else
2709 const Py_UCS4 *four_bytes;
2710 const Py_UCS4 *ucs4_end;
2711 Py_ssize_t num_surrogates;
2712#endif
2713 wchar_t *w;
2714 wchar_t *wchar_end;
2715
2716 if (!PyUnicode_Check(unicode)) {
2717 PyErr_BadArgument();
2718 return NULL;
2719 }
2720 u = (PyUnicodeObject*)unicode;
2721 if (_PyUnicode_WSTR(u) == NULL) {
2722 /* Non-ASCII compact unicode object */
2723 assert(_PyUnicode_KIND(u) != 0);
2724 assert(PyUnicode_IS_READY(u));
2725
2726#ifdef Py_DEBUG
2727 ++unicode_as_unicode_calls;
2728#endif
2729
2730 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
2731#if SIZEOF_WCHAR_T == 2
2732 four_bytes = PyUnicode_4BYTE_DATA(u);
2733 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
2734 num_surrogates = 0;
2735
2736 for (; four_bytes < ucs4_end; ++four_bytes) {
2737 if (*four_bytes > 0xFFFF)
2738 ++num_surrogates;
2739 }
2740
2741 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
2742 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
2743 if (!_PyUnicode_WSTR(u)) {
2744 PyErr_NoMemory();
2745 return NULL;
2746 }
2747 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
2748
2749 w = _PyUnicode_WSTR(u);
2750 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
2751 four_bytes = PyUnicode_4BYTE_DATA(u);
2752 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
2753 if (*four_bytes > 0xFFFF) {
2754 /* encode surrogate pair in this case */
2755 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
2756 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
2757 }
2758 else
2759 *w = *four_bytes;
2760
2761 if (w > wchar_end) {
2762 assert(0 && "Miscalculated string end");
2763 }
2764 }
2765 *w = 0;
2766#else
2767 /* sizeof(wchar_t) == 4 */
2768 Py_FatalError("Impossible unicode object state, wstr and str "
2769 "should share memory already.");
2770 return NULL;
2771#endif
2772 }
2773 else {
2774 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
2775 (_PyUnicode_LENGTH(u) + 1));
2776 if (!_PyUnicode_WSTR(u)) {
2777 PyErr_NoMemory();
2778 return NULL;
2779 }
2780 if (!PyUnicode_IS_COMPACT_ASCII(u))
2781 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
2782 w = _PyUnicode_WSTR(u);
2783 wchar_end = w + _PyUnicode_LENGTH(u);
2784
2785 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
2786 one_byte = PyUnicode_1BYTE_DATA(u);
2787 for (; w < wchar_end; ++one_byte, ++w)
2788 *w = *one_byte;
2789 /* null-terminate the wstr */
2790 *w = 0;
2791 }
2792 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
2793#if SIZEOF_WCHAR_T == 4
2794 two_bytes = PyUnicode_2BYTE_DATA(u);
2795 for (; w < wchar_end; ++two_bytes, ++w)
2796 *w = *two_bytes;
2797 /* null-terminate the wstr */
2798 *w = 0;
2799#else
2800 /* sizeof(wchar_t) == 2 */
2801 PyObject_FREE(_PyUnicode_WSTR(u));
2802 _PyUnicode_WSTR(u) = NULL;
2803 Py_FatalError("Impossible unicode object state, wstr "
2804 "and str should share memory already.");
2805 return NULL;
2806#endif
2807 }
2808 else {
2809 assert(0 && "This should never happen.");
2810 }
2811 }
2812 }
2813 if (size != NULL)
2814 *size = PyUnicode_WSTR_LENGTH(u);
2815 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00002816}
2817
Alexander Belopolsky40018472011-02-26 01:02:56 +00002818Py_UNICODE *
2819PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002820{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002821 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002822}
2823
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002824
Alexander Belopolsky40018472011-02-26 01:02:56 +00002825Py_ssize_t
2826PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002827{
2828 if (!PyUnicode_Check(unicode)) {
2829 PyErr_BadArgument();
2830 goto onError;
2831 }
2832 return PyUnicode_GET_SIZE(unicode);
2833
Benjamin Peterson29060642009-01-31 22:14:21 +00002834 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002835 return -1;
2836}
2837
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002838Py_ssize_t
2839PyUnicode_GetLength(PyObject *unicode)
2840{
2841 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) != -1) {
2842 PyErr_BadArgument();
2843 return -1;
2844 }
2845
2846 return PyUnicode_GET_LENGTH(unicode);
2847}
2848
2849Py_UCS4
2850PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
2851{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02002852 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
2853 PyErr_BadArgument();
2854 return (Py_UCS4)-1;
2855 }
2856 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
2857 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002858 return (Py_UCS4)-1;
2859 }
2860 return PyUnicode_READ_CHAR(unicode, index);
2861}
2862
2863int
2864PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
2865{
2866 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02002867 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002868 return -1;
2869 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02002870 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
2871 PyErr_SetString(PyExc_IndexError, "string index out of range");
2872 return -1;
2873 }
2874 if (_PyUnicode_Dirty(unicode))
2875 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002876 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
2877 index, ch);
2878 return 0;
2879}
2880
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be modified or freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
2886
Victor Stinner554f3f02010-06-16 23:33:54 +00002887/* create or adjust a UnicodeDecodeError */
2888static void
2889make_decode_exception(PyObject **exceptionObject,
2890 const char *encoding,
2891 const char *input, Py_ssize_t length,
2892 Py_ssize_t startpos, Py_ssize_t endpos,
2893 const char *reason)
2894{
2895 if (*exceptionObject == NULL) {
2896 *exceptionObject = PyUnicodeDecodeError_Create(
2897 encoding, input, length, startpos, endpos, reason);
2898 }
2899 else {
2900 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2901 goto onError;
2902 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2903 goto onError;
2904 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2905 goto onError;
2906 }
2907 return;
2908
2909onError:
2910 Py_DECREF(*exceptionObject);
2911 *exceptionObject = NULL;
2912}
2913
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002914/* error handling callback helper:
2915 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00002916 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002917 and adjust various state variables.
2918 return 0 on success, -1 on error
2919*/
2920
Alexander Belopolsky40018472011-02-26 01:02:56 +00002921static int
2922unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002923 const char *encoding, const char *reason,
2924 const char **input, const char **inend, Py_ssize_t *startinpos,
2925 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2926 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002927{
Benjamin Peterson142957c2008-07-04 19:55:29 +00002928 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002929
2930 PyObject *restuple = NULL;
2931 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002932 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002933 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002934 Py_ssize_t requiredsize;
2935 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002936 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002937 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002938 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002939 int res = -1;
2940
2941 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002942 *errorHandler = PyCodec_LookupError(errors);
2943 if (*errorHandler == NULL)
2944 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002945 }
2946
Victor Stinner554f3f02010-06-16 23:33:54 +00002947 make_decode_exception(exceptionObject,
2948 encoding,
2949 *input, *inend - *input,
2950 *startinpos, *endinpos,
2951 reason);
2952 if (*exceptionObject == NULL)
2953 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002954
2955 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2956 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002957 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002958 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002959 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002960 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002961 }
2962 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002963 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002964
2965 /* Copy back the bytes variables, which might have been modified by the
2966 callback */
2967 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2968 if (!inputobj)
2969 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002970 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002971 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002972 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002973 *input = PyBytes_AS_STRING(inputobj);
2974 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002975 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002976 /* we can DECREF safely, as the exception has another reference,
2977 so the object won't go away. */
2978 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002979
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002980 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002981 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002982 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002983 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2984 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002985 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002986
2987 /* need more space? (at least enough for what we
2988 have+the replacement+the rest of the string (starting
2989 at the new input position), so we won't have to check space
2990 when there are no errors in the rest of the string) */
2991 repptr = PyUnicode_AS_UNICODE(repunicode);
2992 repsize = PyUnicode_GET_SIZE(repunicode);
2993 requiredsize = *outpos + repsize + insize-newpos;
2994 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002995 if (requiredsize<2*outsize)
2996 requiredsize = 2*outsize;
2997 if (_PyUnicode_Resize(output, requiredsize) < 0)
2998 goto onError;
2999 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003000 }
3001 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003002 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003003 Py_UNICODE_COPY(*outptr, repptr, repsize);
3004 *outptr += repsize;
3005 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003006
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003007 /* we made it! */
3008 res = 0;
3009
Benjamin Peterson29060642009-01-31 22:14:21 +00003010 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003011 Py_XDECREF(restuple);
3012 return res;
3013}
3014
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */
3018
/* Three simple macros defining base-64, plus DECODE_DIRECT.
 *
 * All of these are function-like macros that evaluate their argument
 * more than once: only pass side-effect-free expressions. */

/* Is c a base-64 character (A-Z, a-z, 0-9, '+' or '/')? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* Given that c is a base-64 character, what is its base-64 value (0..63)?
 * The result is meaningless if IS_BASE64(c) is false: any character that
 * is not a letter, digit or '+' maps to 63. */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n?
 * Higher bits of n are masked off, so callers may pass an unreduced
 * bit buffer. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
3049
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is never written
 * to, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        - the character to test; evaluated several times, so it must
 *            be free of side effects.  NUL and anything >= 128 are never
 *            direct (the range test also guards the table lookup).
 * directO  - non-zero if Set O characters may be emitted as themselves.
 * directWS - non-zero if whitespace may be emitted as itself. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003095
Alexander Belopolsky40018472011-02-26 01:02:56 +00003096PyObject *
3097PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003098 Py_ssize_t size,
3099 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003100{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003101 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3102}
3103
Antoine Pitrou244651a2009-05-04 18:56:13 +00003104/* The decoder. The only state we preserve is our read position,
3105 * i.e. how many characters we have consumed. So if we end in the
3106 * middle of a shift sequence we have to back off the read position
3107 * and the output to the beginning of the sequence, otherwise we lose
3108 * all the shift state (seen bits, number of bits seen, high
3109 * surrogate). */
3110
Alexander Belopolsky40018472011-02-26 01:02:56 +00003111PyObject *
3112PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003113 Py_ssize_t size,
3114 const char *errors,
3115 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003116{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003117 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003118 Py_ssize_t startinpos;
3119 Py_ssize_t endinpos;
3120 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003121 const char *e;
3122 PyUnicodeObject *unicode;
3123 Py_UNICODE *p;
3124 const char *errmsg = "";
3125 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003126 Py_UNICODE *shiftOutStart;
3127 unsigned int base64bits = 0;
3128 unsigned long base64buffer = 0;
3129 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003130 PyObject *errorHandler = NULL;
3131 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003132
3133 unicode = _PyUnicode_New(size);
3134 if (!unicode)
3135 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003136 if (size == 0) {
3137 if (consumed)
3138 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003139 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003140 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003142 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003143 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003144 e = s + size;
3145
3146 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003147 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003148 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003149 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003150
Antoine Pitrou244651a2009-05-04 18:56:13 +00003151 if (inShift) { /* in a base-64 section */
3152 if (IS_BASE64(ch)) { /* consume a base-64 character */
3153 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3154 base64bits += 6;
3155 s++;
3156 if (base64bits >= 16) {
3157 /* we have enough bits for a UTF-16 value */
3158 Py_UNICODE outCh = (Py_UNICODE)
3159 (base64buffer >> (base64bits-16));
3160 base64bits -= 16;
3161 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3162 if (surrogate) {
3163 /* expecting a second surrogate */
3164 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3165#ifdef Py_UNICODE_WIDE
3166 *p++ = (((surrogate & 0x3FF)<<10)
3167 | (outCh & 0x3FF)) + 0x10000;
3168#else
3169 *p++ = surrogate;
3170 *p++ = outCh;
3171#endif
3172 surrogate = 0;
3173 }
3174 else {
3175 surrogate = 0;
3176 errmsg = "second surrogate missing";
3177 goto utf7Error;
3178 }
3179 }
3180 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3181 /* first surrogate */
3182 surrogate = outCh;
3183 }
3184 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3185 errmsg = "unexpected second surrogate";
3186 goto utf7Error;
3187 }
3188 else {
3189 *p++ = outCh;
3190 }
3191 }
3192 }
3193 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003194 inShift = 0;
3195 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003196 if (surrogate) {
3197 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003198 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003199 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003200 if (base64bits > 0) { /* left-over bits */
3201 if (base64bits >= 6) {
3202 /* We've seen at least one base-64 character */
3203 errmsg = "partial character in shift sequence";
3204 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003205 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003206 else {
3207 /* Some bits remain; they should be zero */
3208 if (base64buffer != 0) {
3209 errmsg = "non-zero padding bits in shift sequence";
3210 goto utf7Error;
3211 }
3212 }
3213 }
3214 if (ch != '-') {
3215 /* '-' is absorbed; other terminating
3216 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003217 *p++ = ch;
3218 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003219 }
3220 }
3221 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003222 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003223 s++; /* consume '+' */
3224 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003225 s++;
3226 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003227 }
3228 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003229 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003230 shiftOutStart = p;
3231 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003232 }
3233 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003234 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003235 *p++ = ch;
3236 s++;
3237 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003238 else {
3239 startinpos = s-starts;
3240 s++;
3241 errmsg = "unexpected special character";
3242 goto utf7Error;
3243 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003244 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003245utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003246 outpos = p-PyUnicode_AS_UNICODE(unicode);
3247 endinpos = s-starts;
3248 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003249 errors, &errorHandler,
3250 "utf7", errmsg,
3251 &starts, &e, &startinpos, &endinpos, &exc, &s,
3252 &unicode, &outpos, &p))
3253 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003254 }
3255
Antoine Pitrou244651a2009-05-04 18:56:13 +00003256 /* end of string */
3257
3258 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3259 /* if we're in an inconsistent state, that's an error */
3260 if (surrogate ||
3261 (base64bits >= 6) ||
3262 (base64bits > 0 && base64buffer != 0)) {
3263 outpos = p-PyUnicode_AS_UNICODE(unicode);
3264 endinpos = size;
3265 if (unicode_decode_call_errorhandler(
3266 errors, &errorHandler,
3267 "utf7", "unterminated shift sequence",
3268 &starts, &e, &startinpos, &endinpos, &exc, &s,
3269 &unicode, &outpos, &p))
3270 goto onError;
3271 if (s < e)
3272 goto restart;
3273 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003274 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003275
3276 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003277 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003278 if (inShift) {
3279 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003280 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003281 }
3282 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003283 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003284 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003285 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003286
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003287 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003288 goto onError;
3289
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003290 Py_XDECREF(errorHandler);
3291 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003292 if (PyUnicode_READY(unicode) == -1) {
3293 Py_DECREF(unicode);
3294 return NULL;
3295 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003296 return (PyObject *)unicode;
3297
Benjamin Peterson29060642009-01-31 22:14:21 +00003298 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003299 Py_XDECREF(errorHandler);
3300 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003301 Py_DECREF(unicode);
3302 return NULL;
3303}
3304
3305
Alexander Belopolsky40018472011-02-26 01:02:56 +00003306PyObject *
3307PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003308 Py_ssize_t size,
3309 int base64SetO,
3310 int base64WhiteSpace,
3311 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003312{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003313 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003314 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003315 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003316 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003317 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003318 unsigned int base64bits = 0;
3319 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003320 char * out;
3321 char * start;
3322
3323 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003324 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003325
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003326 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003327 return PyErr_NoMemory();
3328
Antoine Pitrou244651a2009-05-04 18:56:13 +00003329 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003330 if (v == NULL)
3331 return NULL;
3332
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003333 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003334 for (;i < size; ++i) {
3335 Py_UNICODE ch = s[i];
3336
Antoine Pitrou244651a2009-05-04 18:56:13 +00003337 if (inShift) {
3338 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3339 /* shifting out */
3340 if (base64bits) { /* output remaining bits */
3341 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3342 base64buffer = 0;
3343 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003344 }
3345 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003346 /* Characters not in the BASE64 set implicitly unshift the sequence
3347 so no '-' is required, except if the character is itself a '-' */
3348 if (IS_BASE64(ch) || ch == '-') {
3349 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003350 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003351 *out++ = (char) ch;
3352 }
3353 else {
3354 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003355 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003356 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003357 else { /* not in a shift sequence */
3358 if (ch == '+') {
3359 *out++ = '+';
3360 *out++ = '-';
3361 }
3362 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3363 *out++ = (char) ch;
3364 }
3365 else {
3366 *out++ = '+';
3367 inShift = 1;
3368 goto encode_char;
3369 }
3370 }
3371 continue;
3372encode_char:
3373#ifdef Py_UNICODE_WIDE
3374 if (ch >= 0x10000) {
3375 /* code first surrogate */
3376 base64bits += 16;
3377 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3378 while (base64bits >= 6) {
3379 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3380 base64bits -= 6;
3381 }
3382 /* prepare second surrogate */
3383 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3384 }
3385#endif
3386 base64bits += 16;
3387 base64buffer = (base64buffer << 16) | ch;
3388 while (base64bits >= 6) {
3389 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3390 base64bits -= 6;
3391 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003392 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003393 if (base64bits)
3394 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3395 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003396 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003397 if (_PyBytes_Resize(&v, out - start) < 0)
3398 return NULL;
3399 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003400}
3401
Antoine Pitrou244651a2009-05-04 18:56:13 +00003402#undef IS_BASE64
3403#undef FROM_BASE64
3404#undef TO_BASE64
3405#undef DECODE_DIRECT
3406#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003407
/* --- UTF-8 Codec -------------------------------------------------------- */
3409
/* Read-only lookup table indexed by the first byte of a UTF-8 sequence. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
3431
Alexander Belopolsky40018472011-02-26 01:02:56 +00003432PyObject *
3433PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003434 Py_ssize_t size,
3435 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003436{
Walter Dörwald69652032004-09-07 20:24:22 +00003437 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3438}
3439
Antoine Pitrouab868312009-01-10 15:40:25 +00003440/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3441#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3442
3443/* Mask to quickly check whether a C 'long' contains a
3444 non-ASCII, UTF8-encoded char. */
3445#if (SIZEOF_LONG == 8)
3446# define ASCII_CHAR_MASK 0x8080808080808080L
3447#elif (SIZEOF_LONG == 4)
3448# define ASCII_CHAR_MASK 0x80808080L
3449#else
3450# error C 'long' size should be either 4 or 8!
3451#endif
3452
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003453/* Scans a UTF-8 string and returns the maximum character to be expected,
3454 the size of the decoded unicode string and if any major errors were
3455 encountered.
3456
3457 This function does check basic UTF-8 sanity, it does however NOT CHECK
3458 if the string contains surrogates, and if all continuation bytes are
3459 within the correct ranges, these checks are performed in
3460 PyUnicode_DecodeUTF8Stateful.
3461
3462 If it sets has_errors to 1, it means the value of unicode_size and max_char
3463 will be bogus and you should not rely on useful information in them.
3464 */
3465static Py_UCS4
3466utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3467 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3468 int *has_errors)
3469{
3470 Py_ssize_t n;
3471 Py_ssize_t char_count = 0;
3472 Py_UCS4 max_char = 127, new_max;
3473 Py_UCS4 upper_bound;
3474 const unsigned char *p = (const unsigned char *)s;
3475 const unsigned char *end = p + string_size;
3476 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3477 int err = 0;
3478
3479 for (; p < end && !err; ++p, ++char_count) {
3480 /* Only check value if it's not a ASCII char... */
3481 if (*p < 0x80) {
3482 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3483 an explanation. */
3484 if (!((size_t) p & LONG_PTR_MASK)) {
3485 /* Help register allocation */
3486 register const unsigned char *_p = p;
3487 while (_p < aligned_end) {
3488 unsigned long value = *(unsigned long *) _p;
3489 if (value & ASCII_CHAR_MASK)
3490 break;
3491 _p += SIZEOF_LONG;
3492 char_count += SIZEOF_LONG;
3493 }
3494 p = _p;
3495 if (p == end)
3496 break;
3497 }
3498 }
3499 if (*p >= 0x80) {
3500 n = utf8_code_length[*p];
3501 new_max = max_char;
3502 switch (n) {
3503 /* invalid start byte */
3504 case 0:
3505 err = 1;
3506 break;
3507 case 2:
3508 /* Code points between 0x00FF and 0x07FF inclusive.
3509 Approximate the upper bound of the code point,
3510 if this flips over 255 we can be sure it will be more
3511 than 255 and the string will need 2 bytes per code coint,
3512 if it stays under or equal to 255, we can be sure 1 byte
3513 is enough.
3514 ((*p & 0b00011111) << 6) | 0b00111111 */
3515 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3516 if (max_char < upper_bound)
3517 new_max = upper_bound;
3518 /* Ensure we track at least that we left ASCII space. */
3519 if (new_max < 128)
3520 new_max = 128;
3521 break;
3522 case 3:
3523 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3524 always > 255 and <= 65535 and will always need 2 bytes. */
3525 if (max_char < 65535)
3526 new_max = 65535;
3527 break;
3528 case 4:
3529 /* Code point will be above 0xFFFF for sure in this case. */
3530 new_max = 65537;
3531 break;
3532 /* Internal error, this should be caught by the first if */
3533 case 1:
3534 default:
3535 assert(0 && "Impossible case in utf8_max_char_and_size");
3536 err = 1;
3537 }
3538 /* Instead of number of overall bytes for this code point,
3539 n containts the number of following bytes: */
3540 --n;
3541 /* Check if the follow up chars are all valid continuation bytes */
3542 if (n >= 1) {
3543 const unsigned char *cont;
3544 if ((p + n) >= end) {
3545 if (consumed == 0)
3546 /* incomplete data, non-incremental decoding */
3547 err = 1;
3548 break;
3549 }
3550 for (cont = p + 1; cont < (p + n); ++cont) {
3551 if ((*cont & 0xc0) != 0x80) {
3552 err = 1;
3553 break;
3554 }
3555 }
3556 p += n;
3557 }
3558 else
3559 err = 1;
3560 max_char = new_max;
3561 }
3562 }
3563
3564 if (unicode_size)
3565 *unicode_size = char_count;
3566 if (has_errors)
3567 *has_errors = err;
3568 return max_char;
3569}
3570
/* Similar to PyUnicode_WRITE but can also write into wstr field
   of the legacy unicode representation.

   kind  - one of PyUnicode_WCHAR_KIND, PyUnicode_1BYTE_KIND,
           PyUnicode_2BYTE_KIND; any other value is treated as 4-byte.
           Evaluated once.
   buf   - start of the character buffer; cast to the element type
           selected by `kind`.
   index - element index to store at.
   value - value to store; cast (and possibly truncated) to the element
           type.

   `buf`, `index` and `value` each appear in every branch but only the
   selected branch executes, so each is evaluated once per use. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value) \
    do { \
        const int k_ = (kind); \
        if (k_ == PyUnicode_WCHAR_KIND) \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
        else if (k_ == PyUnicode_1BYTE_KIND) \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else if (k_ == PyUnicode_2BYTE_KIND) \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value); \
        else \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value); \
    } while (0)
3585
Alexander Belopolsky40018472011-02-26 01:02:56 +00003586PyObject *
3587PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003588 Py_ssize_t size,
3589 const char *errors,
3590 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003591{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003592 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003593 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00003594 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003595 Py_ssize_t startinpos;
3596 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00003597 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003598 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003599 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003600 PyObject *errorHandler = NULL;
3601 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003602 Py_UCS4 maxchar = 0;
3603 Py_ssize_t unicode_size;
3604 Py_ssize_t i;
3605 int kind;
3606 void *data;
3607 int has_errors;
3608 Py_UNICODE *error_outptr;
3609#if SIZEOF_WCHAR_T == 2
3610 Py_ssize_t wchar_offset = 0;
3611#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003612
Walter Dörwald69652032004-09-07 20:24:22 +00003613 if (size == 0) {
3614 if (consumed)
3615 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003616 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00003617 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003618 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
3619 consumed, &has_errors);
3620 if (has_errors) {
3621 unicode = _PyUnicode_New(size);
3622 if (!unicode)
3623 return NULL;
3624 kind = PyUnicode_WCHAR_KIND;
3625 data = PyUnicode_AS_UNICODE(unicode);
3626 assert(data != NULL);
3627 }
3628 else {
3629 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
3630 if (!unicode)
3631 return NULL;
3632 /* When the string is ASCII only, just use memcpy and return.
3633 unicode_size may be != size if there is an incomplete UTF-8
3634 sequence at the end of the ASCII block. */
3635 if (maxchar < 128 && size == unicode_size) {
3636 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
3637 return (PyObject *)unicode;
3638 }
3639 kind = PyUnicode_KIND(unicode);
3640 data = PyUnicode_DATA(unicode);
3641 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003642 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003643 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003644 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00003645 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003646
3647 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003648 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003649
3650 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00003651 /* Fast path for runs of ASCII characters. Given that common UTF-8
3652 input will consist of an overwhelming majority of ASCII
3653 characters, we try to optimize for this case by checking
3654 as many characters as a C 'long' can contain.
3655 First, check if we can do an aligned read, as most CPUs have
3656 a penalty for unaligned reads.
3657 */
3658 if (!((size_t) s & LONG_PTR_MASK)) {
3659 /* Help register allocation */
3660 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003661 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003662 while (_s < aligned_end) {
3663 /* Read a whole long at a time (either 4 or 8 bytes),
3664 and do a fast unrolled copy if it only contains ASCII
3665 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003666 unsigned long value = *(unsigned long *) _s;
3667 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00003668 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003669 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
3670 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
3671 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
3672 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003673#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003674 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
3675 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
3676 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
3677 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003678#endif
3679 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003680 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00003681 }
3682 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003683 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003684 if (s == e)
3685 break;
3686 ch = (unsigned char)*s;
3687 }
3688 }
3689
3690 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003691 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003692 s++;
3693 continue;
3694 }
3695
3696 n = utf8_code_length[ch];
3697
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003698 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003699 if (consumed)
3700 break;
3701 else {
3702 errmsg = "unexpected end of data";
3703 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003704 endinpos = startinpos+1;
3705 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
3706 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003707 goto utf8Error;
3708 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003709 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003710
3711 switch (n) {
3712
3713 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00003714 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003715 startinpos = s-starts;
3716 endinpos = startinpos+1;
3717 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003718
3719 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003720 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00003721 startinpos = s-starts;
3722 endinpos = startinpos+1;
3723 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003724
3725 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003726 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00003727 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003728 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003729 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00003730 goto utf8Error;
3731 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003732 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003733 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003734 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003735 break;
3736
3737 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00003738 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
3739 will result in surrogates in range d800-dfff. Surrogates are
3740 not valid UTF-8 so they are rejected.
3741 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
3742 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00003743 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003744 (s[2] & 0xc0) != 0x80 ||
3745 ((unsigned char)s[0] == 0xE0 &&
3746 (unsigned char)s[1] < 0xA0) ||
3747 ((unsigned char)s[0] == 0xED &&
3748 (unsigned char)s[1] > 0x9F)) {
3749 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003750 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003751 endinpos = startinpos + 1;
3752
3753 /* if s[1] first two bits are 1 and 0, then the invalid
3754 continuation byte is s[2], so increment endinpos by 1,
3755 if not, s[1] is invalid and endinpos doesn't need to
3756 be incremented. */
3757 if ((s[1] & 0xC0) == 0x80)
3758 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003759 goto utf8Error;
3760 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003761 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003762 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003763 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003764 break;
3765
3766 case 4:
3767 if ((s[1] & 0xc0) != 0x80 ||
3768 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003769 (s[3] & 0xc0) != 0x80 ||
3770 ((unsigned char)s[0] == 0xF0 &&
3771 (unsigned char)s[1] < 0x90) ||
3772 ((unsigned char)s[0] == 0xF4 &&
3773 (unsigned char)s[1] > 0x8F)) {
3774 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003775 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003776 endinpos = startinpos + 1;
3777 if ((s[1] & 0xC0) == 0x80) {
3778 endinpos++;
3779 if ((s[2] & 0xC0) == 0x80)
3780 endinpos++;
3781 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003782 goto utf8Error;
3783 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003784 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00003785 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
3786 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
3787
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003788 /* If the string is flexible or we have native UCS-4, write
3789 directly.. */
3790 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
3791 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00003792
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003793 else {
3794 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00003795
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003796 /* translate from 10000..10FFFF to 0..FFFF */
3797 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00003798
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003799 /* high surrogate = top 10 bits added to D800 */
3800 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3801 (Py_UNICODE)(0xD800 + (ch >> 10)));
3802
3803 /* low surrogate = bottom 10 bits added to DC00 */
3804 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3805 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
3806 }
3807#if SIZEOF_WCHAR_T == 2
3808 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003809#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003810 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003811 }
3812 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00003813 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003814
Benjamin Peterson29060642009-01-31 22:14:21 +00003815 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003816 /* If this is not yet a resizable string, make it one.. */
3817 if (kind != PyUnicode_WCHAR_KIND) {
3818 const Py_UNICODE *u;
3819 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
3820 if (!new_unicode)
3821 goto onError;
3822 u = PyUnicode_AsUnicode((PyObject *)unicode);
3823 if (!u)
3824 goto onError;
3825#if SIZEOF_WCHAR_T == 2
3826 i += wchar_offset;
3827#endif
3828 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
3829 Py_DECREF(unicode);
3830 unicode = new_unicode;
3831 kind = 0;
3832 data = PyUnicode_AS_UNICODE(new_unicode);
3833 assert(data != NULL);
3834 }
3835 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00003836 if (unicode_decode_call_errorhandler(
3837 errors, &errorHandler,
3838 "utf8", errmsg,
3839 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003840 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00003841 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003842 /* Update data because unicode_decode_call_errorhandler might have
3843 re-created or resized the unicode object. */
3844 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00003845 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003846 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003847 /* Ensure the unicode_size calculation above was correct: */
3848 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
3849
Walter Dörwald69652032004-09-07 20:24:22 +00003850 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003851 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003852
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003853 /* Adjust length and ready string when it contained errors and
3854 is of the old resizable kind. */
3855 if (kind == PyUnicode_WCHAR_KIND) {
3856 if (_PyUnicode_Resize(&unicode, i) < 0 ||
3857 PyUnicode_READY(unicode) == -1)
3858 goto onError;
3859 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003860
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003861 Py_XDECREF(errorHandler);
3862 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003863 if (PyUnicode_READY(unicode) == -1) {
3864 Py_DECREF(unicode);
3865 return NULL;
3866 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003867 return (PyObject *)unicode;
3868
Benjamin Peterson29060642009-01-31 22:14:21 +00003869 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003870 Py_XDECREF(errorHandler);
3871 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003872 Py_DECREF(unicode);
3873 return NULL;
3874}
3875
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003876#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00003877
Victor Stinnerf933e1a2010-10-20 22:58:25 +00003878#ifdef __APPLE__
3879
3880/* Simplified UTF-8 decoder using surrogateescape error handler,
3881 used to decode the command line arguments on Mac OS X. */
3882
3883wchar_t*
3884_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
3885{
3886 int n;
3887 const char *e;
3888 wchar_t *unicode, *p;
3889
3890 /* Note: size will always be longer than the resulting Unicode
3891 character count */
3892 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
3893 PyErr_NoMemory();
3894 return NULL;
3895 }
3896 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
3897 if (!unicode)
3898 return NULL;
3899
3900 /* Unpack UTF-8 encoded data */
3901 p = unicode;
3902 e = s + size;
3903 while (s < e) {
3904 Py_UCS4 ch = (unsigned char)*s;
3905
3906 if (ch < 0x80) {
3907 *p++ = (wchar_t)ch;
3908 s++;
3909 continue;
3910 }
3911
3912 n = utf8_code_length[ch];
3913 if (s + n > e) {
3914 goto surrogateescape;
3915 }
3916
3917 switch (n) {
3918 case 0:
3919 case 1:
3920 goto surrogateescape;
3921
3922 case 2:
3923 if ((s[1] & 0xc0) != 0x80)
3924 goto surrogateescape;
3925 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
3926 assert ((ch > 0x007F) && (ch <= 0x07FF));
3927 *p++ = (wchar_t)ch;
3928 break;
3929
3930 case 3:
3931 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
3932 will result in surrogates in range d800-dfff. Surrogates are
3933 not valid UTF-8 so they are rejected.
3934 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
3935 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
3936 if ((s[1] & 0xc0) != 0x80 ||
3937 (s[2] & 0xc0) != 0x80 ||
3938 ((unsigned char)s[0] == 0xE0 &&
3939 (unsigned char)s[1] < 0xA0) ||
3940 ((unsigned char)s[0] == 0xED &&
3941 (unsigned char)s[1] > 0x9F)) {
3942
3943 goto surrogateescape;
3944 }
3945 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
3946 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003947 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00003948 break;
3949
3950 case 4:
3951 if ((s[1] & 0xc0) != 0x80 ||
3952 (s[2] & 0xc0) != 0x80 ||
3953 (s[3] & 0xc0) != 0x80 ||
3954 ((unsigned char)s[0] == 0xF0 &&
3955 (unsigned char)s[1] < 0x90) ||
3956 ((unsigned char)s[0] == 0xF4 &&
3957 (unsigned char)s[1] > 0x8F)) {
3958 goto surrogateescape;
3959 }
3960 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
3961 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
3962 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
3963
3964#if SIZEOF_WCHAR_T == 4
3965 *p++ = (wchar_t)ch;
3966#else
3967 /* compute and append the two surrogates: */
3968
3969 /* translate from 10000..10FFFF to 0..FFFF */
3970 ch -= 0x10000;
3971
3972 /* high surrogate = top 10 bits added to D800 */
3973 *p++ = (wchar_t)(0xD800 + (ch >> 10));
3974
3975 /* low surrogate = bottom 10 bits added to DC00 */
3976 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
3977#endif
3978 break;
3979 }
3980 s += n;
3981 continue;
3982
3983 surrogateescape:
3984 *p++ = 0xDC00 + ch;
3985 s++;
3986 }
3987 *p = L'\0';
3988 return unicode;
3989}
3990
3991#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00003992
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003993/* Primary internal function which creates utf8 encoded bytes objects.
3994
3995 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00003996 and allocate exactly as much space needed at the end. Else allocate the
3997 maximum possible needed (4 result bytes per Unicode character), and return
3998 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003999*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004000PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004001_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004002{
Tim Peters602f7402002-04-27 18:03:26 +00004003#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004004
Guido van Rossum98297ee2007-11-06 21:34:58 +00004005 Py_ssize_t i; /* index into s of next input byte */
4006 PyObject *result; /* result string object */
4007 char *p; /* next free byte in output buffer */
4008 Py_ssize_t nallocated; /* number of result bytes allocated */
4009 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004010 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004011 PyObject *errorHandler = NULL;
4012 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004013 int kind;
4014 void *data;
4015 Py_ssize_t size;
4016 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4017#if SIZEOF_WCHAR_T == 2
4018 Py_ssize_t wchar_offset = 0;
4019#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004020
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004021 if (!PyUnicode_Check(unicode)) {
4022 PyErr_BadArgument();
4023 return NULL;
4024 }
4025
4026 if (PyUnicode_READY(unicode) == -1)
4027 return NULL;
4028
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004029 if (PyUnicode_UTF8(unicode))
4030 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4031 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004032
4033 kind = PyUnicode_KIND(unicode);
4034 data = PyUnicode_DATA(unicode);
4035 size = PyUnicode_GET_LENGTH(unicode);
4036
Tim Peters602f7402002-04-27 18:03:26 +00004037 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004038
Tim Peters602f7402002-04-27 18:03:26 +00004039 if (size <= MAX_SHORT_UNICHARS) {
4040 /* Write into the stack buffer; nallocated can't overflow.
4041 * At the end, we'll allocate exactly as much heap space as it
4042 * turns out we need.
4043 */
4044 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004045 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004046 p = stackbuf;
4047 }
4048 else {
4049 /* Overallocate on the heap, and give the excess back at the end. */
4050 nallocated = size * 4;
4051 if (nallocated / 4 != size) /* overflow! */
4052 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004053 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004054 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004055 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004056 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004057 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004058
Tim Peters602f7402002-04-27 18:03:26 +00004059 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004060 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004061
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004062 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004063 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004064 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004065
Guido van Rossumd57fd912000-03-10 22:53:23 +00004066 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004067 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004068 *p++ = (char)(0xc0 | (ch >> 6));
4069 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004070 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004071 Py_ssize_t newpos;
4072 PyObject *rep;
4073 Py_ssize_t repsize, k, startpos;
4074 startpos = i-1;
4075#if SIZEOF_WCHAR_T == 2
4076 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004077#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004078 rep = unicode_encode_call_errorhandler(
4079 errors, &errorHandler, "utf-8", "surrogates not allowed",
4080 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4081 &exc, startpos, startpos+1, &newpos);
4082 if (!rep)
4083 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004084
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004085 if (PyBytes_Check(rep))
4086 repsize = PyBytes_GET_SIZE(rep);
4087 else
4088 repsize = PyUnicode_GET_SIZE(rep);
4089
4090 if (repsize > 4) {
4091 Py_ssize_t offset;
4092
4093 if (result == NULL)
4094 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004095 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004096 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004097
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004098 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4099 /* integer overflow */
4100 PyErr_NoMemory();
4101 goto error;
4102 }
4103 nallocated += repsize - 4;
4104 if (result != NULL) {
4105 if (_PyBytes_Resize(&result, nallocated) < 0)
4106 goto error;
4107 } else {
4108 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004109 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004110 goto error;
4111 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4112 }
4113 p = PyBytes_AS_STRING(result) + offset;
4114 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004115
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004116 if (PyBytes_Check(rep)) {
4117 char *prep = PyBytes_AS_STRING(rep);
4118 for(k = repsize; k > 0; k--)
4119 *p++ = *prep++;
4120 } else /* rep is unicode */ {
4121 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4122 Py_UNICODE c;
4123
4124 for(k=0; k<repsize; k++) {
4125 c = prep[k];
4126 if (0x80 <= c) {
4127 raise_encode_exception(&exc, "utf-8",
4128 PyUnicode_AS_UNICODE(unicode),
4129 size, i-1, i,
4130 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004131 goto error;
4132 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004133 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004134 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004135 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004136 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004137 } else if (ch < 0x10000) {
4138 *p++ = (char)(0xe0 | (ch >> 12));
4139 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4140 *p++ = (char)(0x80 | (ch & 0x3f));
4141 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004142 /* Encode UCS4 Unicode ordinals */
4143 *p++ = (char)(0xf0 | (ch >> 18));
4144 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4145 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4146 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004147#if SIZEOF_WCHAR_T == 2
4148 wchar_offset++;
4149#endif
Tim Peters602f7402002-04-27 18:03:26 +00004150 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004151 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004152
Guido van Rossum98297ee2007-11-06 21:34:58 +00004153 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004154 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004155 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004156 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004157 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004158 }
4159 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004160 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004161 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004162 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004163 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004164 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004165
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004166 Py_XDECREF(errorHandler);
4167 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004168 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004169 error:
4170 Py_XDECREF(errorHandler);
4171 Py_XDECREF(exc);
4172 Py_XDECREF(result);
4173 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004174
Tim Peters602f7402002-04-27 18:03:26 +00004175#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004176}
4177
Alexander Belopolsky40018472011-02-26 01:02:56 +00004178PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004179PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4180 Py_ssize_t size,
4181 const char *errors)
4182{
4183 PyObject *v, *unicode;
4184
4185 unicode = PyUnicode_FromUnicode(s, size);
4186 if (unicode == NULL)
4187 return NULL;
4188 v = _PyUnicode_AsUTF8String(unicode, errors);
4189 Py_DECREF(unicode);
4190 return v;
4191}
4192
4193PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004194PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004195{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004196 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004197}
4198
Walter Dörwald41980ca2007-08-16 21:55:45 +00004199/* --- UTF-32 Codec ------------------------------------------------------- */
4200
4201PyObject *
4202PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004203 Py_ssize_t size,
4204 const char *errors,
4205 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004206{
4207 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4208}
4209
4210PyObject *
4211PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004212 Py_ssize_t size,
4213 const char *errors,
4214 int *byteorder,
4215 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004216{
4217 const char *starts = s;
4218 Py_ssize_t startinpos;
4219 Py_ssize_t endinpos;
4220 Py_ssize_t outpos;
4221 PyUnicodeObject *unicode;
4222 Py_UNICODE *p;
4223#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004224 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004225 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004226#else
4227 const int pairs = 0;
4228#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004229 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004230 int bo = 0; /* assume native ordering by default */
4231 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004232 /* Offsets from q for retrieving bytes in the right order. */
4233#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4234 int iorder[] = {0, 1, 2, 3};
4235#else
4236 int iorder[] = {3, 2, 1, 0};
4237#endif
4238 PyObject *errorHandler = NULL;
4239 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004240
Walter Dörwald41980ca2007-08-16 21:55:45 +00004241 q = (unsigned char *)s;
4242 e = q + size;
4243
4244 if (byteorder)
4245 bo = *byteorder;
4246
4247 /* Check for BOM marks (U+FEFF) in the input and adjust current
4248 byte order setting accordingly. In native mode, the leading BOM
4249 mark is skipped, in all other modes, it is copied to the output
4250 stream as-is (giving a ZWNBSP character). */
4251 if (bo == 0) {
4252 if (size >= 4) {
4253 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004254 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004255#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004256 if (bom == 0x0000FEFF) {
4257 q += 4;
4258 bo = -1;
4259 }
4260 else if (bom == 0xFFFE0000) {
4261 q += 4;
4262 bo = 1;
4263 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004264#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004265 if (bom == 0x0000FEFF) {
4266 q += 4;
4267 bo = 1;
4268 }
4269 else if (bom == 0xFFFE0000) {
4270 q += 4;
4271 bo = -1;
4272 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004273#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004274 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004275 }
4276
4277 if (bo == -1) {
4278 /* force LE */
4279 iorder[0] = 0;
4280 iorder[1] = 1;
4281 iorder[2] = 2;
4282 iorder[3] = 3;
4283 }
4284 else if (bo == 1) {
4285 /* force BE */
4286 iorder[0] = 3;
4287 iorder[1] = 2;
4288 iorder[2] = 1;
4289 iorder[3] = 0;
4290 }
4291
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004292 /* On narrow builds we split characters outside the BMP into two
4293 codepoints => count how much extra space we need. */
4294#ifndef Py_UNICODE_WIDE
4295 for (qq = q; qq < e; qq += 4)
4296 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4297 pairs++;
4298#endif
4299
4300 /* This might be one to much, because of a BOM */
4301 unicode = _PyUnicode_New((size+3)/4+pairs);
4302 if (!unicode)
4303 return NULL;
4304 if (size == 0)
4305 return (PyObject *)unicode;
4306
4307 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004308 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004309
Walter Dörwald41980ca2007-08-16 21:55:45 +00004310 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004311 Py_UCS4 ch;
4312 /* remaining bytes at the end? (size should be divisible by 4) */
4313 if (e-q<4) {
4314 if (consumed)
4315 break;
4316 errmsg = "truncated data";
4317 startinpos = ((const char *)q)-starts;
4318 endinpos = ((const char *)e)-starts;
4319 goto utf32Error;
4320 /* The remaining input chars are ignored if the callback
4321 chooses to skip the input */
4322 }
4323 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4324 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004325
Benjamin Peterson29060642009-01-31 22:14:21 +00004326 if (ch >= 0x110000)
4327 {
4328 errmsg = "codepoint not in range(0x110000)";
4329 startinpos = ((const char *)q)-starts;
4330 endinpos = startinpos+4;
4331 goto utf32Error;
4332 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004333#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004334 if (ch >= 0x10000)
4335 {
4336 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4337 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4338 }
4339 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004340#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004341 *p++ = ch;
4342 q += 4;
4343 continue;
4344 utf32Error:
4345 outpos = p-PyUnicode_AS_UNICODE(unicode);
4346 if (unicode_decode_call_errorhandler(
4347 errors, &errorHandler,
4348 "utf32", errmsg,
4349 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4350 &unicode, &outpos, &p))
4351 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004352 }
4353
4354 if (byteorder)
4355 *byteorder = bo;
4356
4357 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004358 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004359
4360 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004361 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004362 goto onError;
4363
4364 Py_XDECREF(errorHandler);
4365 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004366 if (PyUnicode_READY(unicode) == -1) {
4367 Py_DECREF(unicode);
4368 return NULL;
4369 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004370 return (PyObject *)unicode;
4371
Benjamin Peterson29060642009-01-31 22:14:21 +00004372 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004373 Py_DECREF(unicode);
4374 Py_XDECREF(errorHandler);
4375 Py_XDECREF(exc);
4376 return NULL;
4377}
4378
4379PyObject *
4380PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004381 Py_ssize_t size,
4382 const char *errors,
4383 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004384{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004385 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004386 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004387 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004388#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004389 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004390#else
4391 const int pairs = 0;
4392#endif
4393 /* Offsets from p for storing byte pairs in the right order. */
4394#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4395 int iorder[] = {0, 1, 2, 3};
4396#else
4397 int iorder[] = {3, 2, 1, 0};
4398#endif
4399
Benjamin Peterson29060642009-01-31 22:14:21 +00004400#define STORECHAR(CH) \
4401 do { \
4402 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4403 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4404 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4405 p[iorder[0]] = (CH) & 0xff; \
4406 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004407 } while(0)
4408
4409 /* In narrow builds we can output surrogate pairs as one codepoint,
4410 so we need less space. */
4411#ifndef Py_UNICODE_WIDE
4412 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004413 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4414 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4415 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004416#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004417 nsize = (size - pairs + (byteorder == 0));
4418 bytesize = nsize * 4;
4419 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004420 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004421 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004422 if (v == NULL)
4423 return NULL;
4424
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004425 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004426 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004427 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004428 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004429 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004430
4431 if (byteorder == -1) {
4432 /* force LE */
4433 iorder[0] = 0;
4434 iorder[1] = 1;
4435 iorder[2] = 2;
4436 iorder[3] = 3;
4437 }
4438 else if (byteorder == 1) {
4439 /* force BE */
4440 iorder[0] = 3;
4441 iorder[1] = 2;
4442 iorder[2] = 1;
4443 iorder[3] = 0;
4444 }
4445
4446 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004447 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004448#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004449 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4450 Py_UCS4 ch2 = *s;
4451 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4452 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4453 s++;
4454 size--;
4455 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004456 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004457#endif
4458 STORECHAR(ch);
4459 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004460
4461 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004462 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004463#undef STORECHAR
4464}
4465
Alexander Belopolsky40018472011-02-26 01:02:56 +00004466PyObject *
4467PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004468{
4469 if (!PyUnicode_Check(unicode)) {
4470 PyErr_BadArgument();
4471 return NULL;
4472 }
4473 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004474 PyUnicode_GET_SIZE(unicode),
4475 NULL,
4476 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004477}
4478
Guido van Rossumd57fd912000-03-10 22:53:23 +00004479/* --- UTF-16 Codec ------------------------------------------------------- */
4480
Tim Peters772747b2001-08-09 22:21:55 +00004481PyObject *
4482PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004483 Py_ssize_t size,
4484 const char *errors,
4485 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004486{
Walter Dörwald69652032004-09-07 20:24:22 +00004487 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4488}
4489
Antoine Pitrouab868312009-01-10 15:40:25 +00004490/* Two masks for fast checking of whether a C 'long' may contain
4491 UTF16-encoded surrogate characters. This is an efficient heuristic,
4492 assuming that non-surrogate characters with a code point >= 0x8000 are
4493 rare in most input.
4494 FAST_CHAR_MASK is used when the input is in native byte ordering,
4495 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004496*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004497#if (SIZEOF_LONG == 8)
4498# define FAST_CHAR_MASK 0x8000800080008000L
4499# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4500#elif (SIZEOF_LONG == 4)
4501# define FAST_CHAR_MASK 0x80008000L
4502# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4503#else
4504# error C 'long' size should be either 4 or 8!
4505#endif
4506
Walter Dörwald69652032004-09-07 20:24:22 +00004507PyObject *
4508PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004509 Py_ssize_t size,
4510 const char *errors,
4511 int *byteorder,
4512 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004513{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004514 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004515 Py_ssize_t startinpos;
4516 Py_ssize_t endinpos;
4517 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004518 PyUnicodeObject *unicode;
4519 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004520 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004521 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004522 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004523 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004524 /* Offsets from q for retrieving byte pairs in the right order. */
4525#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4526 int ihi = 1, ilo = 0;
4527#else
4528 int ihi = 0, ilo = 1;
4529#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004530 PyObject *errorHandler = NULL;
4531 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004532
4533 /* Note: size will always be longer than the resulting Unicode
4534 character count */
4535 unicode = _PyUnicode_New(size);
4536 if (!unicode)
4537 return NULL;
4538 if (size == 0)
4539 return (PyObject *)unicode;
4540
4541 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004542 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004543 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004544 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004545
4546 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004547 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004548
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004549 /* Check for BOM marks (U+FEFF) in the input and adjust current
4550 byte order setting accordingly. In native mode, the leading BOM
4551 mark is skipped, in all other modes, it is copied to the output
4552 stream as-is (giving a ZWNBSP character). */
4553 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004554 if (size >= 2) {
4555 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004556#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004557 if (bom == 0xFEFF) {
4558 q += 2;
4559 bo = -1;
4560 }
4561 else if (bom == 0xFFFE) {
4562 q += 2;
4563 bo = 1;
4564 }
Tim Petersced69f82003-09-16 20:30:58 +00004565#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004566 if (bom == 0xFEFF) {
4567 q += 2;
4568 bo = 1;
4569 }
4570 else if (bom == 0xFFFE) {
4571 q += 2;
4572 bo = -1;
4573 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004574#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004575 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004576 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004577
Tim Peters772747b2001-08-09 22:21:55 +00004578 if (bo == -1) {
4579 /* force LE */
4580 ihi = 1;
4581 ilo = 0;
4582 }
4583 else if (bo == 1) {
4584 /* force BE */
4585 ihi = 0;
4586 ilo = 1;
4587 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004588#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4589 native_ordering = ilo < ihi;
4590#else
4591 native_ordering = ilo > ihi;
4592#endif
Tim Peters772747b2001-08-09 22:21:55 +00004593
Antoine Pitrouab868312009-01-10 15:40:25 +00004594 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00004595 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004596 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00004597 /* First check for possible aligned read of a C 'long'. Unaligned
4598 reads are more expensive, better to defer to another iteration. */
4599 if (!((size_t) q & LONG_PTR_MASK)) {
4600 /* Fast path for runs of non-surrogate chars. */
4601 register const unsigned char *_q = q;
4602 Py_UNICODE *_p = p;
4603 if (native_ordering) {
4604 /* Native ordering is simple: as long as the input cannot
4605 possibly contain a surrogate char, do an unrolled copy
4606 of several 16-bit code points to the target object.
4607 The non-surrogate check is done on several input bytes
4608 at a time (as many as a C 'long' can contain). */
4609 while (_q < aligned_end) {
4610 unsigned long data = * (unsigned long *) _q;
4611 if (data & FAST_CHAR_MASK)
4612 break;
4613 _p[0] = ((unsigned short *) _q)[0];
4614 _p[1] = ((unsigned short *) _q)[1];
4615#if (SIZEOF_LONG == 8)
4616 _p[2] = ((unsigned short *) _q)[2];
4617 _p[3] = ((unsigned short *) _q)[3];
4618#endif
4619 _q += SIZEOF_LONG;
4620 _p += SIZEOF_LONG / 2;
4621 }
4622 }
4623 else {
4624 /* Byteswapped ordering is similar, but we must decompose
4625 the copy bytewise, and take care of zero'ing out the
4626 upper bytes if the target object is in 32-bit units
4627 (that is, in UCS-4 builds). */
4628 while (_q < aligned_end) {
4629 unsigned long data = * (unsigned long *) _q;
4630 if (data & SWAPPED_FAST_CHAR_MASK)
4631 break;
4632 /* Zero upper bytes in UCS-4 builds */
4633#if (Py_UNICODE_SIZE > 2)
4634 _p[0] = 0;
4635 _p[1] = 0;
4636#if (SIZEOF_LONG == 8)
4637 _p[2] = 0;
4638 _p[3] = 0;
4639#endif
4640#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004641 /* Issue #4916; UCS-4 builds on big endian machines must
4642 fill the two last bytes of each 4-byte unit. */
4643#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
4644# define OFF 2
4645#else
4646# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00004647#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004648 ((unsigned char *) _p)[OFF + 1] = _q[0];
4649 ((unsigned char *) _p)[OFF + 0] = _q[1];
4650 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
4651 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
4652#if (SIZEOF_LONG == 8)
4653 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
4654 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
4655 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
4656 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
4657#endif
4658#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00004659 _q += SIZEOF_LONG;
4660 _p += SIZEOF_LONG / 2;
4661 }
4662 }
4663 p = _p;
4664 q = _q;
4665 if (q >= e)
4666 break;
4667 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004668 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004669
Benjamin Peterson14339b62009-01-31 16:36:08 +00004670 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00004671
4672 if (ch < 0xD800 || ch > 0xDFFF) {
4673 *p++ = ch;
4674 continue;
4675 }
4676
4677 /* UTF-16 code pair: */
4678 if (q > e) {
4679 errmsg = "unexpected end of data";
4680 startinpos = (((const char *)q) - 2) - starts;
4681 endinpos = ((const char *)e) + 1 - starts;
4682 goto utf16Error;
4683 }
4684 if (0xD800 <= ch && ch <= 0xDBFF) {
4685 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
4686 q += 2;
4687 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00004688#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004689 *p++ = ch;
4690 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004691#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004692 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004693#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004694 continue;
4695 }
4696 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004697 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00004698 startinpos = (((const char *)q)-4)-starts;
4699 endinpos = startinpos+2;
4700 goto utf16Error;
4701 }
4702
Benjamin Peterson14339b62009-01-31 16:36:08 +00004703 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004704 errmsg = "illegal encoding";
4705 startinpos = (((const char *)q)-2)-starts;
4706 endinpos = startinpos+2;
4707 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004708
Benjamin Peterson29060642009-01-31 22:14:21 +00004709 utf16Error:
4710 outpos = p - PyUnicode_AS_UNICODE(unicode);
4711 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00004712 errors,
4713 &errorHandler,
4714 "utf16", errmsg,
4715 &starts,
4716 (const char **)&e,
4717 &startinpos,
4718 &endinpos,
4719 &exc,
4720 (const char **)&q,
4721 &unicode,
4722 &outpos,
4723 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00004724 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004725 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004726 /* remaining byte at the end? (size should be even) */
4727 if (e == q) {
4728 if (!consumed) {
4729 errmsg = "truncated data";
4730 startinpos = ((const char *)q) - starts;
4731 endinpos = ((const char *)e) + 1 - starts;
4732 outpos = p - PyUnicode_AS_UNICODE(unicode);
4733 if (unicode_decode_call_errorhandler(
4734 errors,
4735 &errorHandler,
4736 "utf16", errmsg,
4737 &starts,
4738 (const char **)&e,
4739 &startinpos,
4740 &endinpos,
4741 &exc,
4742 (const char **)&q,
4743 &unicode,
4744 &outpos,
4745 &p))
4746 goto onError;
4747 /* The remaining input chars are ignored if the callback
4748 chooses to skip the input */
4749 }
4750 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004751
4752 if (byteorder)
4753 *byteorder = bo;
4754
Walter Dörwald69652032004-09-07 20:24:22 +00004755 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004756 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00004757
Guido van Rossumd57fd912000-03-10 22:53:23 +00004758 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004759 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004760 goto onError;
4761
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004762 Py_XDECREF(errorHandler);
4763 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004764 if (PyUnicode_READY(unicode) == -1) {
4765 Py_DECREF(unicode);
4766 return NULL;
4767 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004768 return (PyObject *)unicode;
4769
Benjamin Peterson29060642009-01-31 22:14:21 +00004770 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004771 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004772 Py_XDECREF(errorHandler);
4773 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004774 return NULL;
4775}
4776
Antoine Pitrouab868312009-01-10 15:40:25 +00004777#undef FAST_CHAR_MASK
4778#undef SWAPPED_FAST_CHAR_MASK
4779
Tim Peters772747b2001-08-09 22:21:55 +00004780PyObject *
4781PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004782 Py_ssize_t size,
4783 const char *errors,
4784 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004785{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004786 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00004787 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004788 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004789#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004790 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004791#else
4792 const int pairs = 0;
4793#endif
Tim Peters772747b2001-08-09 22:21:55 +00004794 /* Offsets from p for storing byte pairs in the right order. */
4795#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4796 int ihi = 1, ilo = 0;
4797#else
4798 int ihi = 0, ilo = 1;
4799#endif
4800
Benjamin Peterson29060642009-01-31 22:14:21 +00004801#define STORECHAR(CH) \
4802 do { \
4803 p[ihi] = ((CH) >> 8) & 0xff; \
4804 p[ilo] = (CH) & 0xff; \
4805 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00004806 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004807
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004808#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004809 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004810 if (s[i] >= 0x10000)
4811 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004812#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004813 /* 2 * (size + pairs + (byteorder == 0)) */
4814 if (size > PY_SSIZE_T_MAX ||
4815 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00004816 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004817 nsize = size + pairs + (byteorder == 0);
4818 bytesize = nsize * 2;
4819 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004820 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004821 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004822 if (v == NULL)
4823 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004824
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004825 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004826 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004827 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00004828 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004829 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00004830
4831 if (byteorder == -1) {
4832 /* force LE */
4833 ihi = 1;
4834 ilo = 0;
4835 }
4836 else if (byteorder == 1) {
4837 /* force BE */
4838 ihi = 0;
4839 ilo = 1;
4840 }
4841
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004842 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004843 Py_UNICODE ch = *s++;
4844 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004845#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004846 if (ch >= 0x10000) {
4847 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
4848 ch = 0xD800 | ((ch-0x10000) >> 10);
4849 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004850#endif
Tim Peters772747b2001-08-09 22:21:55 +00004851 STORECHAR(ch);
4852 if (ch2)
4853 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004854 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004855
4856 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004857 return v;
Tim Peters772747b2001-08-09 22:21:55 +00004858#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00004859}
4860
Alexander Belopolsky40018472011-02-26 01:02:56 +00004861PyObject *
4862PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004863{
4864 if (!PyUnicode_Check(unicode)) {
4865 PyErr_BadArgument();
4866 return NULL;
4867 }
4868 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004869 PyUnicode_GET_SIZE(unicode),
4870 NULL,
4871 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004872}
4873
4874/* --- Unicode Escape Codec ----------------------------------------------- */
4875
/* Helper function for PyUnicode_DecodeUnicodeEscape.  Scans the size bytes
   at s and decides whether the unescaped result is guaranteed to be pure
   ASCII.

   Returns the number of characters the unescaped string will contain, or
   -1 if that cannot be guaranteed, namely when:
     - size is negative,
     - a byte above 127 occurs,
     - a backslash is the last byte of the input, or
     - an octal, \x, \u, \U or \N escape occurs (these may produce
       non-ASCII characters).

   A backslash followed by a newline contributes no characters; one of the
   single-character escapes (\\ \' \" \b \f \t \n \r \v \a) contributes
   one; a backslash followed by any other ASCII byte is kept verbatim and
   contributes two.
 */
Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before forming s + size: computing a pointer
       that lies before the start of the buffer is undefined behavior. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
4930
/* Similar to PyUnicode_WRITE: store value at position index of buf.
   When kind is PyUnicode_WCHAR_KIND, buf is accessed as a Py_UNICODE
   array; for every other kind it is accessed as an array of unsigned
   char (the ASCII case). */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value)                    \
    do {                                                                \
        if ((kind) == PyUnicode_WCHAR_KIND)                             \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);       \
        else                                                            \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
    } while (0)

/* Store value at position index of buf, accessed as a Py_UNICODE array.
   Takes no kind argument: the assertion refers to a variable named
   `kind` that must be in scope wherever the macro is used. */
#define WRITE_WSTR(buf, index, value)                   \
    assert(kind == PyUnicode_WCHAR_KIND),               \
    ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value)
4944
4945
Fredrik Lundh06d12682001-01-24 07:59:11 +00004946static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00004947
Alexander Belopolsky40018472011-02-26 01:02:56 +00004948PyObject *
4949PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004950 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02004951 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004952{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004953 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004954 Py_ssize_t startinpos;
4955 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004956 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004957 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004958 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004959 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004960 char* message;
4961 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004962 PyObject *errorHandler = NULL;
4963 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004964 Py_ssize_t ascii_length;
4965 Py_ssize_t i;
4966 int kind;
4967 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004968
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004969 ascii_length = length_of_escaped_ascii_string(s, size);
4970
4971 /* After length_of_escaped_ascii_string() there are two alternatives,
4972 either the string is pure ASCII with named escapes like \n, etc.
4973 and we determined it's exact size (common case)
4974 or it contains \x, \u, ... escape sequences. then we create a
4975 legacy wchar string and resize it at the end of this function. */
4976 if (ascii_length >= 0) {
4977 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
4978 if (!v)
4979 goto onError;
4980 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
4981 kind = PyUnicode_1BYTE_KIND;
4982 data = PyUnicode_DATA(v);
4983 }
4984 else {
4985 /* Escaped strings will always be longer than the resulting
4986 Unicode string, so we start with size here and then reduce the
4987 length after conversion to the true value.
4988 (but if the error callback returns a long replacement string
4989 we'll have to allocate more space) */
4990 v = _PyUnicode_New(size);
4991 if (!v)
4992 goto onError;
4993 kind = PyUnicode_WCHAR_KIND;
4994 data = PyUnicode_AS_UNICODE(v);
4995 }
4996
Guido van Rossumd57fd912000-03-10 22:53:23 +00004997 if (size == 0)
4998 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004999 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005000 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005001
Guido van Rossumd57fd912000-03-10 22:53:23 +00005002 while (s < end) {
5003 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005004 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005005 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005006
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005007 if (kind == PyUnicode_WCHAR_KIND) {
5008 assert(i < _PyUnicode_WSTR_LENGTH(v));
5009 }
5010 else {
5011 /* The only case in which i == ascii_length is a backslash
5012 followed by a newline. */
5013 assert(i <= ascii_length);
5014 }
5015
Guido van Rossumd57fd912000-03-10 22:53:23 +00005016 /* Non-escape characters are interpreted as Unicode ordinals */
5017 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005018 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005019 continue;
5020 }
5021
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005022 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005023 /* \ - Escapes */
5024 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005025 c = *s++;
5026 if (s > end)
5027 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005028
5029 if (kind == PyUnicode_WCHAR_KIND) {
5030 assert(i < _PyUnicode_WSTR_LENGTH(v));
5031 }
5032 else {
5033 /* The only case in which i == ascii_length is a backslash
5034 followed by a newline. */
5035 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5036 }
5037
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005038 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005039
Benjamin Peterson29060642009-01-31 22:14:21 +00005040 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005041 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005042 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5043 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5044 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5045 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5046 /* FF */
5047 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5048 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5049 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5050 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5051 /* VT */
5052 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5053 /* BEL, not classic C */
5054 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005055
Benjamin Peterson29060642009-01-31 22:14:21 +00005056 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005057 case '0': case '1': case '2': case '3':
5058 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005059 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005060 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005061 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005062 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005063 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005064 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005065 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005066 break;
5067
Benjamin Peterson29060642009-01-31 22:14:21 +00005068 /* hex escapes */
5069 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005070 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005071 digits = 2;
5072 message = "truncated \\xXX escape";
5073 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005074
Benjamin Peterson29060642009-01-31 22:14:21 +00005075 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005076 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005077 digits = 4;
5078 message = "truncated \\uXXXX escape";
5079 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005080
Benjamin Peterson29060642009-01-31 22:14:21 +00005081 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005082 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005083 digits = 8;
5084 message = "truncated \\UXXXXXXXX escape";
5085 hexescape:
5086 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005087 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005088 if (s+digits>end) {
5089 endinpos = size;
5090 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005091 errors, &errorHandler,
5092 "unicodeescape", "end of string in escape sequence",
5093 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005094 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005095 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005096 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005097 goto nextByte;
5098 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005099 for (j = 0; j < digits; ++j) {
5100 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005101 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005102 endinpos = (s+j+1)-starts;
5103 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005104 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005105 errors, &errorHandler,
5106 "unicodeescape", message,
5107 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005108 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005109 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005110 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005111 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005112 }
5113 chr = (chr<<4) & ~0xF;
5114 if (c >= '0' && c <= '9')
5115 chr += c - '0';
5116 else if (c >= 'a' && c <= 'f')
5117 chr += 10 + c - 'a';
5118 else
5119 chr += 10 + c - 'A';
5120 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005121 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005122 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005123 /* _decoding_error will have already written into the
5124 target buffer. */
5125 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005126 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005127 /* when we get here, chr is a 32-bit unicode character */
5128 if (chr <= 0xffff)
5129 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005130 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005131 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005132 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005133 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005134#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005135 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005136#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005137 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005138 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5139 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005140#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005141 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005142 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005143 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005144 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005145 errors, &errorHandler,
5146 "unicodeescape", "illegal Unicode character",
5147 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005148 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005149 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005150 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005151 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005152 break;
5153
Benjamin Peterson29060642009-01-31 22:14:21 +00005154 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005155 case 'N':
5156 message = "malformed \\N character escape";
5157 if (ucnhash_CAPI == NULL) {
5158 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005159 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5160 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005161 if (ucnhash_CAPI == NULL)
5162 goto ucnhashError;
5163 }
5164 if (*s == '{') {
5165 const char *start = s+1;
5166 /* look for the closing brace */
5167 while (*s != '}' && s < end)
5168 s++;
5169 if (s > start && s < end && *s == '}') {
5170 /* found a name. look it up in the unicode database */
5171 message = "unknown Unicode character name";
5172 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005173 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5174 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005175 goto store;
5176 }
5177 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005178 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005179 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005180 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005181 errors, &errorHandler,
5182 "unicodeescape", message,
5183 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005184 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005185 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005186 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005187 break;
5188
5189 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005190 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005191 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005192 message = "\\ at end of string";
5193 s--;
5194 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005195 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005196 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005197 errors, &errorHandler,
5198 "unicodeescape", message,
5199 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005200 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005201 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005202 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005203 }
5204 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005205 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5206 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005207 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005208 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005209 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005210 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005211 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005212 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005213 /* Ensure the length prediction worked in case of ASCII strings */
5214 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5215
5216 if (kind == PyUnicode_WCHAR_KIND && (_PyUnicode_Resize(&v, i) < 0 ||
5217 PyUnicode_READY(v) == -1))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005218 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005219 Py_XDECREF(errorHandler);
5220 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005221 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005222
Benjamin Peterson29060642009-01-31 22:14:21 +00005223 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005224 PyErr_SetString(
5225 PyExc_UnicodeError,
5226 "\\N escapes not supported (can't load unicodedata module)"
5227 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005228 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005229 Py_XDECREF(errorHandler);
5230 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005231 return NULL;
5232
Benjamin Peterson29060642009-01-31 22:14:21 +00005233 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005234 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005235 Py_XDECREF(errorHandler);
5236 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005237 return NULL;
5238}
5239
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005240#undef WRITE_ASCII_OR_WSTR
5241#undef WRITE_WSTR
5242
/* Return a Unicode-Escape string version of the Unicode object.

   The result is a bytes object in which backslashes, tab, newline,
   carriage return, other non-printable ASCII and all non-ASCII
   characters are written as backslash escapes.  No surrounding quotes
   are added.

*/
5249
Walter Dörwald79e913e2007-05-12 11:08:06 +00005250static const char *hexdigits = "0123456789abcdef";
5251
Alexander Belopolsky40018472011-02-26 01:02:56 +00005252PyObject *
5253PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005254 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005255{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005256 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005257 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005258
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005259#ifdef Py_UNICODE_WIDE
5260 const Py_ssize_t expandsize = 10;
5261#else
5262 const Py_ssize_t expandsize = 6;
5263#endif
5264
Thomas Wouters89f507f2006-12-13 04:49:30 +00005265 /* XXX(nnorwitz): rather than over-allocating, it would be
5266 better to choose a different scheme. Perhaps scan the
5267 first N-chars of the string and allocate based on that size.
5268 */
5269 /* Initial allocation is based on the longest-possible unichr
5270 escape.
5271
5272 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5273 unichr, so in this case it's the longest unichr escape. In
5274 narrow (UTF-16) builds this is five chars per source unichr
5275 since there are two unichrs in the surrogate pair, so in narrow
5276 (UTF-16) builds it's not the longest unichr escape.
5277
5278 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5279 so in the narrow (UTF-16) build case it's the longest unichr
5280 escape.
5281 */
5282
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005283 if (size == 0)
5284 return PyBytes_FromStringAndSize(NULL, 0);
5285
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005286 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005287 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005288
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005289 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005290 2
5291 + expandsize*size
5292 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005293 if (repr == NULL)
5294 return NULL;
5295
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005296 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005297
Guido van Rossumd57fd912000-03-10 22:53:23 +00005298 while (size-- > 0) {
5299 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005300
Walter Dörwald79e913e2007-05-12 11:08:06 +00005301 /* Escape backslashes */
5302 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005303 *p++ = '\\';
5304 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005305 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005306 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005307
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005308#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005309 /* Map 21-bit characters to '\U00xxxxxx' */
5310 else if (ch >= 0x10000) {
5311 *p++ = '\\';
5312 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005313 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5314 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5315 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5316 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5317 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5318 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5319 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5320 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005321 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005322 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005323#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005324 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5325 else if (ch >= 0xD800 && ch < 0xDC00) {
5326 Py_UNICODE ch2;
5327 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005328
Benjamin Peterson29060642009-01-31 22:14:21 +00005329 ch2 = *s++;
5330 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005331 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005332 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5333 *p++ = '\\';
5334 *p++ = 'U';
5335 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5336 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5337 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5338 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5339 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5340 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5341 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5342 *p++ = hexdigits[ucs & 0x0000000F];
5343 continue;
5344 }
5345 /* Fall through: isolated surrogates are copied as-is */
5346 s--;
5347 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005348 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005349#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005350
Guido van Rossumd57fd912000-03-10 22:53:23 +00005351 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005352 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005353 *p++ = '\\';
5354 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005355 *p++ = hexdigits[(ch >> 12) & 0x000F];
5356 *p++ = hexdigits[(ch >> 8) & 0x000F];
5357 *p++ = hexdigits[(ch >> 4) & 0x000F];
5358 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005359 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005360
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005361 /* Map special whitespace to '\t', \n', '\r' */
5362 else if (ch == '\t') {
5363 *p++ = '\\';
5364 *p++ = 't';
5365 }
5366 else if (ch == '\n') {
5367 *p++ = '\\';
5368 *p++ = 'n';
5369 }
5370 else if (ch == '\r') {
5371 *p++ = '\\';
5372 *p++ = 'r';
5373 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005374
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005375 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005376 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005377 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005378 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005379 *p++ = hexdigits[(ch >> 4) & 0x000F];
5380 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005381 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005382
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383 /* Copy everything else as-is */
5384 else
5385 *p++ = (char) ch;
5386 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005387
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005388 assert(p - PyBytes_AS_STRING(repr) > 0);
5389 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5390 return NULL;
5391 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005392}
5393
Alexander Belopolsky40018472011-02-26 01:02:56 +00005394PyObject *
5395PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005396{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005397 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005398 if (!PyUnicode_Check(unicode)) {
5399 PyErr_BadArgument();
5400 return NULL;
5401 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005402 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5403 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005404 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005405}
5406
5407/* --- Raw Unicode Escape Codec ------------------------------------------- */
5408
Alexander Belopolsky40018472011-02-26 01:02:56 +00005409PyObject *
5410PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005411 Py_ssize_t size,
5412 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005413{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005414 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005415 Py_ssize_t startinpos;
5416 Py_ssize_t endinpos;
5417 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005418 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005419 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005420 const char *end;
5421 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005422 PyObject *errorHandler = NULL;
5423 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005424
Guido van Rossumd57fd912000-03-10 22:53:23 +00005425 /* Escaped strings will always be longer than the resulting
5426 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005427 length after conversion to the true value. (But decoding error
5428 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005429 v = _PyUnicode_New(size);
5430 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005431 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005432 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005433 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005434 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005435 end = s + size;
5436 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005437 unsigned char c;
5438 Py_UCS4 x;
5439 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005440 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005441
Benjamin Peterson29060642009-01-31 22:14:21 +00005442 /* Non-escape characters are interpreted as Unicode ordinals */
5443 if (*s != '\\') {
5444 *p++ = (unsigned char)*s++;
5445 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005446 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005447 startinpos = s-starts;
5448
5449 /* \u-escapes are only interpreted iff the number of leading
5450 backslashes if odd */
5451 bs = s;
5452 for (;s < end;) {
5453 if (*s != '\\')
5454 break;
5455 *p++ = (unsigned char)*s++;
5456 }
5457 if (((s - bs) & 1) == 0 ||
5458 s >= end ||
5459 (*s != 'u' && *s != 'U')) {
5460 continue;
5461 }
5462 p--;
5463 count = *s=='u' ? 4 : 8;
5464 s++;
5465
5466 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5467 outpos = p-PyUnicode_AS_UNICODE(v);
5468 for (x = 0, i = 0; i < count; ++i, ++s) {
5469 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005470 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005471 endinpos = s-starts;
5472 if (unicode_decode_call_errorhandler(
5473 errors, &errorHandler,
5474 "rawunicodeescape", "truncated \\uXXXX",
5475 &starts, &end, &startinpos, &endinpos, &exc, &s,
5476 &v, &outpos, &p))
5477 goto onError;
5478 goto nextByte;
5479 }
5480 x = (x<<4) & ~0xF;
5481 if (c >= '0' && c <= '9')
5482 x += c - '0';
5483 else if (c >= 'a' && c <= 'f')
5484 x += 10 + c - 'a';
5485 else
5486 x += 10 + c - 'A';
5487 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005488 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005489 /* UCS-2 character */
5490 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005491 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005492 /* UCS-4 character. Either store directly, or as
5493 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005494#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005495 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005496#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005497 x -= 0x10000L;
5498 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5499 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005500#endif
5501 } else {
5502 endinpos = s-starts;
5503 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005504 if (unicode_decode_call_errorhandler(
5505 errors, &errorHandler,
5506 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005507 &starts, &end, &startinpos, &endinpos, &exc, &s,
5508 &v, &outpos, &p))
5509 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005510 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005511 nextByte:
5512 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005513 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005514 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005515 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005516 Py_XDECREF(errorHandler);
5517 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005518 if (PyUnicode_READY(v) == -1) {
5519 Py_DECREF(v);
5520 return NULL;
5521 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005522 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005523
Benjamin Peterson29060642009-01-31 22:14:21 +00005524 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005525 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005526 Py_XDECREF(errorHandler);
5527 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005528 return NULL;
5529}
5530
Alexander Belopolsky40018472011-02-26 01:02:56 +00005531PyObject *
5532PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005533 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005534{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005535 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005536 char *p;
5537 char *q;
5538
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005539#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005540 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005541#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005542 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005543#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005544
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005545 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005546 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005547
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005548 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005549 if (repr == NULL)
5550 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005551 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005552 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005553
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005554 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005555 while (size-- > 0) {
5556 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005557#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005558 /* Map 32-bit characters to '\Uxxxxxxxx' */
5559 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005560 *p++ = '\\';
5561 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005562 *p++ = hexdigits[(ch >> 28) & 0xf];
5563 *p++ = hexdigits[(ch >> 24) & 0xf];
5564 *p++ = hexdigits[(ch >> 20) & 0xf];
5565 *p++ = hexdigits[(ch >> 16) & 0xf];
5566 *p++ = hexdigits[(ch >> 12) & 0xf];
5567 *p++ = hexdigits[(ch >> 8) & 0xf];
5568 *p++ = hexdigits[(ch >> 4) & 0xf];
5569 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005570 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005571 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00005572#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005573 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5574 if (ch >= 0xD800 && ch < 0xDC00) {
5575 Py_UNICODE ch2;
5576 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005577
Benjamin Peterson29060642009-01-31 22:14:21 +00005578 ch2 = *s++;
5579 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005580 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005581 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5582 *p++ = '\\';
5583 *p++ = 'U';
5584 *p++ = hexdigits[(ucs >> 28) & 0xf];
5585 *p++ = hexdigits[(ucs >> 24) & 0xf];
5586 *p++ = hexdigits[(ucs >> 20) & 0xf];
5587 *p++ = hexdigits[(ucs >> 16) & 0xf];
5588 *p++ = hexdigits[(ucs >> 12) & 0xf];
5589 *p++ = hexdigits[(ucs >> 8) & 0xf];
5590 *p++ = hexdigits[(ucs >> 4) & 0xf];
5591 *p++ = hexdigits[ucs & 0xf];
5592 continue;
5593 }
5594 /* Fall through: isolated surrogates are copied as-is */
5595 s--;
5596 size++;
5597 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005598#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005599 /* Map 16-bit characters to '\uxxxx' */
5600 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005601 *p++ = '\\';
5602 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005603 *p++ = hexdigits[(ch >> 12) & 0xf];
5604 *p++ = hexdigits[(ch >> 8) & 0xf];
5605 *p++ = hexdigits[(ch >> 4) & 0xf];
5606 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005607 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005608 /* Copy everything else as-is */
5609 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005610 *p++ = (char) ch;
5611 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005612 size = p - q;
5613
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005614 assert(size > 0);
5615 if (_PyBytes_Resize(&repr, size) < 0)
5616 return NULL;
5617 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618}
5619
Alexander Belopolsky40018472011-02-26 01:02:56 +00005620PyObject *
5621PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005622{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005623 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005624 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00005625 PyErr_BadArgument();
5626 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005627 }
Walter Dörwald711005d2007-05-12 12:03:26 +00005628 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5629 PyUnicode_GET_SIZE(unicode));
5630
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005631 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005632}
5633
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005634/* --- Unicode Internal Codec ------------------------------------------- */
5635
Alexander Belopolsky40018472011-02-26 01:02:56 +00005636PyObject *
5637_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005638 Py_ssize_t size,
5639 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005640{
5641 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005642 Py_ssize_t startinpos;
5643 Py_ssize_t endinpos;
5644 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005645 PyUnicodeObject *v;
5646 Py_UNICODE *p;
5647 const char *end;
5648 const char *reason;
5649 PyObject *errorHandler = NULL;
5650 PyObject *exc = NULL;
5651
Neal Norwitzd43069c2006-01-08 01:12:10 +00005652#ifdef Py_UNICODE_WIDE
5653 Py_UNICODE unimax = PyUnicode_GetMax();
5654#endif
5655
Thomas Wouters89f507f2006-12-13 04:49:30 +00005656 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005657 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
5658 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005659 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005660 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
5661 as string was created with the old API. */
5662 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005663 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005664 p = PyUnicode_AS_UNICODE(v);
5665 end = s + size;
5666
5667 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005668 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005669 /* We have to sanity check the raw data, otherwise doom looms for
5670 some malformed UCS-4 data. */
5671 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00005672#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005673 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00005674#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005675 end-s < Py_UNICODE_SIZE
5676 )
Benjamin Peterson29060642009-01-31 22:14:21 +00005677 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005678 startinpos = s - starts;
5679 if (end-s < Py_UNICODE_SIZE) {
5680 endinpos = end-starts;
5681 reason = "truncated input";
5682 }
5683 else {
5684 endinpos = s - starts + Py_UNICODE_SIZE;
5685 reason = "illegal code point (> 0x10FFFF)";
5686 }
5687 outpos = p - PyUnicode_AS_UNICODE(v);
5688 if (unicode_decode_call_errorhandler(
5689 errors, &errorHandler,
5690 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00005691 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00005692 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005693 goto onError;
5694 }
5695 }
5696 else {
5697 p++;
5698 s += Py_UNICODE_SIZE;
5699 }
5700 }
5701
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005702 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005703 goto onError;
5704 Py_XDECREF(errorHandler);
5705 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005706 if (PyUnicode_READY(v) == -1) {
5707 Py_DECREF(v);
5708 return NULL;
5709 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005710 return (PyObject *)v;
5711
Benjamin Peterson29060642009-01-31 22:14:21 +00005712 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005713 Py_XDECREF(v);
5714 Py_XDECREF(errorHandler);
5715 Py_XDECREF(exc);
5716 return NULL;
5717}
5718
Guido van Rossumd57fd912000-03-10 22:53:23 +00005719/* --- Latin-1 Codec ------------------------------------------------------ */
5720
Alexander Belopolsky40018472011-02-26 01:02:56 +00005721PyObject *
5722PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005723 Py_ssize_t size,
5724 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005725{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005726 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02005727 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005728}
5729
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005730/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005731static void
5732make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005733 const char *encoding,
5734 const Py_UNICODE *unicode, Py_ssize_t size,
5735 Py_ssize_t startpos, Py_ssize_t endpos,
5736 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005737{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005738 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005739 *exceptionObject = PyUnicodeEncodeError_Create(
5740 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741 }
5742 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005743 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
5744 goto onError;
5745 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
5746 goto onError;
5747 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
5748 goto onError;
5749 return;
5750 onError:
5751 Py_DECREF(*exceptionObject);
5752 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005753 }
5754}
5755
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005756/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005757static void
5758raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005759 const char *encoding,
5760 const Py_UNICODE *unicode, Py_ssize_t size,
5761 Py_ssize_t startpos, Py_ssize_t endpos,
5762 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005763{
5764 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005765 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005766 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005767 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005768}
5769
5770/* error handling callback helper:
5771 build arguments, call the callback and check the arguments,
5772 put the result into newpos and return the replacement string, which
5773 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005774static PyObject *
5775unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005776 PyObject **errorHandler,
5777 const char *encoding, const char *reason,
5778 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5779 Py_ssize_t startpos, Py_ssize_t endpos,
5780 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005781{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005782 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005783
5784 PyObject *restuple;
5785 PyObject *resunicode;
5786
5787 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005788 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005789 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005790 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005791 }
5792
5793 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005794 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005795 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005796 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005797
5798 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005799 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005800 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005801 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005802 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005803 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005804 Py_DECREF(restuple);
5805 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005806 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005807 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00005808 &resunicode, newpos)) {
5809 Py_DECREF(restuple);
5810 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005811 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005812 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
5813 PyErr_SetString(PyExc_TypeError, &argparse[3]);
5814 Py_DECREF(restuple);
5815 return NULL;
5816 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005817 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005818 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005819 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005820 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5821 Py_DECREF(restuple);
5822 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005823 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005824 Py_INCREF(resunicode);
5825 Py_DECREF(restuple);
5826 return resunicode;
5827}
5828
Alexander Belopolsky40018472011-02-26 01:02:56 +00005829static PyObject *
5830unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005831 Py_ssize_t size,
5832 const char *errors,
5833 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005834{
5835 /* output object */
5836 PyObject *res;
5837 /* pointers to the beginning and end+1 of input */
5838 const Py_UNICODE *startp = p;
5839 const Py_UNICODE *endp = p + size;
5840 /* pointer to the beginning of the unencodable characters */
5841 /* const Py_UNICODE *badp = NULL; */
5842 /* pointer into the output */
5843 char *str;
5844 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005845 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005846 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
5847 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005848 PyObject *errorHandler = NULL;
5849 PyObject *exc = NULL;
5850 /* the following variable is used for caching string comparisons
5851 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5852 int known_errorHandler = -1;
5853
5854 /* allocate enough for a simple encoding without
5855 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00005856 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00005857 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005858 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005859 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005860 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005861 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005862 ressize = size;
5863
5864 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005865 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005866
Benjamin Peterson29060642009-01-31 22:14:21 +00005867 /* can we encode this? */
5868 if (c<limit) {
5869 /* no overflow check, because we know that the space is enough */
5870 *str++ = (char)c;
5871 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005872 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005873 else {
5874 Py_ssize_t unicodepos = p-startp;
5875 Py_ssize_t requiredsize;
5876 PyObject *repunicode;
5877 Py_ssize_t repsize;
5878 Py_ssize_t newpos;
5879 Py_ssize_t respos;
5880 Py_UNICODE *uni2;
5881 /* startpos for collecting unencodable chars */
5882 const Py_UNICODE *collstart = p;
5883 const Py_UNICODE *collend = p;
5884 /* find all unecodable characters */
5885 while ((collend < endp) && ((*collend)>=limit))
5886 ++collend;
5887 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
5888 if (known_errorHandler==-1) {
5889 if ((errors==NULL) || (!strcmp(errors, "strict")))
5890 known_errorHandler = 1;
5891 else if (!strcmp(errors, "replace"))
5892 known_errorHandler = 2;
5893 else if (!strcmp(errors, "ignore"))
5894 known_errorHandler = 3;
5895 else if (!strcmp(errors, "xmlcharrefreplace"))
5896 known_errorHandler = 4;
5897 else
5898 known_errorHandler = 0;
5899 }
5900 switch (known_errorHandler) {
5901 case 1: /* strict */
5902 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
5903 goto onError;
5904 case 2: /* replace */
5905 while (collstart++<collend)
5906 *str++ = '?'; /* fall through */
5907 case 3: /* ignore */
5908 p = collend;
5909 break;
5910 case 4: /* xmlcharrefreplace */
5911 respos = str - PyBytes_AS_STRING(res);
5912 /* determine replacement size (temporarily (mis)uses p) */
5913 for (p = collstart, repsize = 0; p < collend; ++p) {
5914 if (*p<10)
5915 repsize += 2+1+1;
5916 else if (*p<100)
5917 repsize += 2+2+1;
5918 else if (*p<1000)
5919 repsize += 2+3+1;
5920 else if (*p<10000)
5921 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00005922#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005923 else
5924 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00005925#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005926 else if (*p<100000)
5927 repsize += 2+5+1;
5928 else if (*p<1000000)
5929 repsize += 2+6+1;
5930 else
5931 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005932#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005933 }
5934 requiredsize = respos+repsize+(endp-collend);
5935 if (requiredsize > ressize) {
5936 if (requiredsize<2*ressize)
5937 requiredsize = 2*ressize;
5938 if (_PyBytes_Resize(&res, requiredsize))
5939 goto onError;
5940 str = PyBytes_AS_STRING(res) + respos;
5941 ressize = requiredsize;
5942 }
5943 /* generate replacement (temporarily (mis)uses p) */
5944 for (p = collstart; p < collend; ++p) {
5945 str += sprintf(str, "&#%d;", (int)*p);
5946 }
5947 p = collend;
5948 break;
5949 default:
5950 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5951 encoding, reason, startp, size, &exc,
5952 collstart-startp, collend-startp, &newpos);
5953 if (repunicode == NULL)
5954 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005955 if (PyBytes_Check(repunicode)) {
5956 /* Directly copy bytes result to output. */
5957 repsize = PyBytes_Size(repunicode);
5958 if (repsize > 1) {
5959 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00005960 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005961 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
5962 Py_DECREF(repunicode);
5963 goto onError;
5964 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00005965 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005966 ressize += repsize-1;
5967 }
5968 memcpy(str, PyBytes_AsString(repunicode), repsize);
5969 str += repsize;
5970 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005971 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005972 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005973 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005974 /* need more space? (at least enough for what we
5975 have+the replacement+the rest of the string, so
5976 we won't have to check space for encodable characters) */
5977 respos = str - PyBytes_AS_STRING(res);
5978 repsize = PyUnicode_GET_SIZE(repunicode);
5979 requiredsize = respos+repsize+(endp-collend);
5980 if (requiredsize > ressize) {
5981 if (requiredsize<2*ressize)
5982 requiredsize = 2*ressize;
5983 if (_PyBytes_Resize(&res, requiredsize)) {
5984 Py_DECREF(repunicode);
5985 goto onError;
5986 }
5987 str = PyBytes_AS_STRING(res) + respos;
5988 ressize = requiredsize;
5989 }
5990 /* check if there is anything unencodable in the replacement
5991 and copy it to the output */
5992 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
5993 c = *uni2;
5994 if (c >= limit) {
5995 raise_encode_exception(&exc, encoding, startp, size,
5996 unicodepos, unicodepos+1, reason);
5997 Py_DECREF(repunicode);
5998 goto onError;
5999 }
6000 *str = (char)c;
6001 }
6002 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006003 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006004 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006005 }
6006 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006007 /* Resize if we allocated to much */
6008 size = str - PyBytes_AS_STRING(res);
6009 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006010 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006011 if (_PyBytes_Resize(&res, size) < 0)
6012 goto onError;
6013 }
6014
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006015 Py_XDECREF(errorHandler);
6016 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006017 return res;
6018
6019 onError:
6020 Py_XDECREF(res);
6021 Py_XDECREF(errorHandler);
6022 Py_XDECREF(exc);
6023 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006024}
6025
Alexander Belopolsky40018472011-02-26 01:02:56 +00006026PyObject *
6027PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006028 Py_ssize_t size,
6029 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006031 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032}
6033
Alexander Belopolsky40018472011-02-26 01:02:56 +00006034PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006035_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036{
6037 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006038 PyErr_BadArgument();
6039 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006040 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006041 if (PyUnicode_READY(unicode) == -1)
6042 return NULL;
6043 /* Fast path: if it is a one-byte string, construct
6044 bytes object directly. */
6045 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6046 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6047 PyUnicode_GET_LENGTH(unicode));
6048 /* Non-Latin-1 characters present. Defer to above function to
6049 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006051 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006052 errors);
6053}
6054
6055PyObject*
6056PyUnicode_AsLatin1String(PyObject *unicode)
6057{
6058 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006059}
6060
6061/* --- 7-bit ASCII Codec -------------------------------------------------- */
6062
Alexander Belopolsky40018472011-02-26 01:02:56 +00006063PyObject *
6064PyUnicode_DecodeASCII(const char *s,
6065 Py_ssize_t size,
6066 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006067{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006068 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069 PyUnicodeObject *v;
6070 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006071 Py_ssize_t startinpos;
6072 Py_ssize_t endinpos;
6073 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006074 const char *e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006075 unsigned char* d;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006076 PyObject *errorHandler = NULL;
6077 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006078 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00006079
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006081 if (size == 1 && *(unsigned char*)s < 128)
6082 return PyUnicode_FromOrdinal(*(unsigned char*)s);
6083
6084 /* Fast path. Assume the input actually *is* ASCII, and allocate
6085 a single-block Unicode object with that assumption. If there is
6086 an error, drop the object and start over. */
6087 v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6088 if (v == NULL)
6089 goto onError;
6090 d = PyUnicode_1BYTE_DATA(v);
6091 for (i = 0; i < size; i++) {
6092 unsigned char ch = ((unsigned char*)s)[i];
6093 if (ch < 128)
6094 d[i] = ch;
6095 else
6096 break;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006097 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006098 if (i == size)
6099 return (PyObject*)v;
6100 Py_DECREF(v); /* start over */
Tim Petersced69f82003-09-16 20:30:58 +00006101
Guido van Rossumd57fd912000-03-10 22:53:23 +00006102 v = _PyUnicode_New(size);
6103 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006104 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006106 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006107 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006108 e = s + size;
6109 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006110 register unsigned char c = (unsigned char)*s;
6111 if (c < 128) {
6112 *p++ = c;
6113 ++s;
6114 }
6115 else {
6116 startinpos = s-starts;
6117 endinpos = startinpos + 1;
6118 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6119 if (unicode_decode_call_errorhandler(
6120 errors, &errorHandler,
6121 "ascii", "ordinal not in range(128)",
6122 &starts, &e, &startinpos, &endinpos, &exc, &s,
6123 &v, &outpos, &p))
6124 goto onError;
6125 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00006127 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006128 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6129 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006130 Py_XDECREF(errorHandler);
6131 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006132 if (PyUnicode_READY(v) == -1) {
6133 Py_DECREF(v);
6134 return NULL;
6135 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006136 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006137
Benjamin Peterson29060642009-01-31 22:14:21 +00006138 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006140 Py_XDECREF(errorHandler);
6141 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006142 return NULL;
6143}
6144
Alexander Belopolsky40018472011-02-26 01:02:56 +00006145PyObject *
6146PyUnicode_EncodeASCII(const Py_UNICODE *p,
6147 Py_ssize_t size,
6148 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006150 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151}
6152
Alexander Belopolsky40018472011-02-26 01:02:56 +00006153PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006154_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155{
6156 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006157 PyErr_BadArgument();
6158 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006160 if (PyUnicode_READY(unicode) == -1)
6161 return NULL;
6162 /* Fast path: if it is an ASCII-only string, construct bytes object
6163 directly. Else defer to above function to raise the exception. */
6164 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6165 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6166 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006167 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006168 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006169 errors);
6170}
6171
6172PyObject *
6173PyUnicode_AsASCIIString(PyObject *unicode)
6174{
6175 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006176}
6177
Victor Stinner99b95382011-07-04 14:23:54 +02006178#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006179
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006180/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006181
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006182#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006183#define NEED_RETRY
6184#endif
6185
6186/* XXX This code is limited to "true" double-byte encodings, as
6187 a) it assumes an incomplete character consists of a single byte, and
6188 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006189 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006190
/* Return 1 if the byte at s[offset] is a DBCS lead byte that starts a
   character, 0 otherwise.

   A byte for which IsDBCSLeadByte() is true only counts when one of the
   following holds for the character that CharPrev() reports before it:
     - there is none (CharPrev() returned `curr` itself),
     - its first byte is not a lead byte, or
     - it is exactly two bytes long, so it ends right before `curr`. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
6202
6203/*
6204 * Decode MBCS string into unicode object. If 'final' is set, converts
6205 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6206 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006207static int
6208decode_mbcs(PyUnicodeObject **v,
6209 const char *s, /* MBCS string */
6210 int size, /* sizeof MBCS string */
6211 int final,
6212 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006213{
6214 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006215 Py_ssize_t n;
6216 DWORD usize;
6217 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006218
6219 assert(size >= 0);
6220
Victor Stinner554f3f02010-06-16 23:33:54 +00006221 /* check and handle 'errors' arg */
6222 if (errors==NULL || strcmp(errors, "strict")==0)
6223 flags = MB_ERR_INVALID_CHARS;
6224 else if (strcmp(errors, "ignore")==0)
6225 flags = 0;
6226 else {
6227 PyErr_Format(PyExc_ValueError,
6228 "mbcs encoding does not support errors='%s'",
6229 errors);
6230 return -1;
6231 }
6232
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006233 /* Skip trailing lead-byte unless 'final' is set */
6234 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006235 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006236
6237 /* First get the size of the result */
6238 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006239 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6240 if (usize==0)
6241 goto mbcs_decode_error;
6242 } else
6243 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006244
6245 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006246 /* Create unicode object */
6247 *v = _PyUnicode_New(usize);
6248 if (*v == NULL)
6249 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006250 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006251 }
6252 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006253 /* Extend unicode object */
6254 n = PyUnicode_GET_SIZE(*v);
6255 if (_PyUnicode_Resize(v, n + usize) < 0)
6256 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006257 }
6258
6259 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006260 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006261 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006262 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6263 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006264 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006265 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006266 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006267
6268mbcs_decode_error:
6269 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6270 we raise a UnicodeDecodeError - else it is a 'generic'
6271 windows error
6272 */
6273 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6274 /* Ideally, we should get reason from FormatMessage - this
6275 is the Windows 2000 English version of the message
6276 */
6277 PyObject *exc = NULL;
6278 const char *reason = "No mapping for the Unicode character exists "
6279 "in the target multi-byte code page.";
6280 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6281 if (exc != NULL) {
6282 PyCodec_StrictErrors(exc);
6283 Py_DECREF(exc);
6284 }
6285 } else {
6286 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6287 }
6288 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006289}
6290
Alexander Belopolsky40018472011-02-26 01:02:56 +00006291PyObject *
6292PyUnicode_DecodeMBCSStateful(const char *s,
6293 Py_ssize_t size,
6294 const char *errors,
6295 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006296{
6297 PyUnicodeObject *v = NULL;
6298 int done;
6299
6300 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006301 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006302
6303#ifdef NEED_RETRY
6304 retry:
6305 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006306 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006307 else
6308#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006309 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006310
6311 if (done < 0) {
6312 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006313 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006314 }
6315
6316 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006317 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006318
6319#ifdef NEED_RETRY
6320 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006321 s += done;
6322 size -= done;
6323 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006324 }
6325#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006326 if (PyUnicode_READY(v) == -1) {
6327 Py_DECREF(v);
6328 return NULL;
6329 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006330 return (PyObject *)v;
6331}
6332
Alexander Belopolsky40018472011-02-26 01:02:56 +00006333PyObject *
6334PyUnicode_DecodeMBCS(const char *s,
6335 Py_ssize_t size,
6336 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006337{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006338 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6339}
6340
6341/*
6342 * Convert unicode into string object (MBCS).
6343 * Returns 0 if succeed, -1 otherwise.
6344 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006345static int
6346encode_mbcs(PyObject **repr,
6347 const Py_UNICODE *p, /* unicode */
6348 int size, /* size of unicode */
6349 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006350{
Victor Stinner554f3f02010-06-16 23:33:54 +00006351 BOOL usedDefaultChar = FALSE;
6352 BOOL *pusedDefaultChar;
6353 int mbcssize;
6354 Py_ssize_t n;
6355 PyObject *exc = NULL;
6356 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006357
6358 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006359
Victor Stinner554f3f02010-06-16 23:33:54 +00006360 /* check and handle 'errors' arg */
6361 if (errors==NULL || strcmp(errors, "strict")==0) {
6362 flags = WC_NO_BEST_FIT_CHARS;
6363 pusedDefaultChar = &usedDefaultChar;
6364 } else if (strcmp(errors, "replace")==0) {
6365 flags = 0;
6366 pusedDefaultChar = NULL;
6367 } else {
6368 PyErr_Format(PyExc_ValueError,
6369 "mbcs encoding does not support errors='%s'",
6370 errors);
6371 return -1;
6372 }
6373
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006374 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006375 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006376 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6377 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006378 if (mbcssize == 0) {
6379 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6380 return -1;
6381 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006382 /* If we used a default char, then we failed! */
6383 if (pusedDefaultChar && *pusedDefaultChar)
6384 goto mbcs_encode_error;
6385 } else {
6386 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006387 }
6388
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006389 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006390 /* Create string object */
6391 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6392 if (*repr == NULL)
6393 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006394 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006395 }
6396 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006397 /* Extend string object */
6398 n = PyBytes_Size(*repr);
6399 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6400 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006401 }
6402
6403 /* Do the conversion */
6404 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006405 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006406 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6407 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006408 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6409 return -1;
6410 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006411 if (pusedDefaultChar && *pusedDefaultChar)
6412 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006413 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006414 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006415
6416mbcs_encode_error:
6417 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6418 Py_XDECREF(exc);
6419 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006420}
6421
Alexander Belopolsky40018472011-02-26 01:02:56 +00006422PyObject *
6423PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6424 Py_ssize_t size,
6425 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006426{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006427 PyObject *repr = NULL;
6428 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006429
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006430#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006431 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006432 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006433 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006434 else
6435#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006436 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006437
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006438 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006439 Py_XDECREF(repr);
6440 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006441 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006442
6443#ifdef NEED_RETRY
6444 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006445 p += INT_MAX;
6446 size -= INT_MAX;
6447 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006448 }
6449#endif
6450
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006451 return repr;
6452}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006453
Alexander Belopolsky40018472011-02-26 01:02:56 +00006454PyObject *
6455PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006456{
6457 if (!PyUnicode_Check(unicode)) {
6458 PyErr_BadArgument();
6459 return NULL;
6460 }
6461 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006462 PyUnicode_GET_SIZE(unicode),
6463 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006464}
6465
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006466#undef NEED_RETRY
6467
Victor Stinner99b95382011-07-04 14:23:54 +02006468#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006469
Guido van Rossumd57fd912000-03-10 22:53:23 +00006470/* --- Character Mapping Codec -------------------------------------------- */
6471
Alexander Belopolsky40018472011-02-26 01:02:56 +00006472PyObject *
6473PyUnicode_DecodeCharmap(const char *s,
6474 Py_ssize_t size,
6475 PyObject *mapping,
6476 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006477{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006478 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006479 Py_ssize_t startinpos;
6480 Py_ssize_t endinpos;
6481 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006482 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006483 PyUnicodeObject *v;
6484 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006485 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006486 PyObject *errorHandler = NULL;
6487 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006488 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006489 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006490
Guido van Rossumd57fd912000-03-10 22:53:23 +00006491 /* Default to Latin-1 */
6492 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006493 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006494
6495 v = _PyUnicode_New(size);
6496 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006497 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006498 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006499 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006500 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006501 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006502 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006503 mapstring = PyUnicode_AS_UNICODE(mapping);
6504 maplen = PyUnicode_GET_SIZE(mapping);
6505 while (s < e) {
6506 unsigned char ch = *s;
6507 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006508
Benjamin Peterson29060642009-01-31 22:14:21 +00006509 if (ch < maplen)
6510 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006511
Benjamin Peterson29060642009-01-31 22:14:21 +00006512 if (x == 0xfffe) {
6513 /* undefined mapping */
6514 outpos = p-PyUnicode_AS_UNICODE(v);
6515 startinpos = s-starts;
6516 endinpos = startinpos+1;
6517 if (unicode_decode_call_errorhandler(
6518 errors, &errorHandler,
6519 "charmap", "character maps to <undefined>",
6520 &starts, &e, &startinpos, &endinpos, &exc, &s,
6521 &v, &outpos, &p)) {
6522 goto onError;
6523 }
6524 continue;
6525 }
6526 *p++ = x;
6527 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006528 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006529 }
6530 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006531 while (s < e) {
6532 unsigned char ch = *s;
6533 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006534
Benjamin Peterson29060642009-01-31 22:14:21 +00006535 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6536 w = PyLong_FromLong((long)ch);
6537 if (w == NULL)
6538 goto onError;
6539 x = PyObject_GetItem(mapping, w);
6540 Py_DECREF(w);
6541 if (x == NULL) {
6542 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6543 /* No mapping found means: mapping is undefined. */
6544 PyErr_Clear();
6545 x = Py_None;
6546 Py_INCREF(x);
6547 } else
6548 goto onError;
6549 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006550
Benjamin Peterson29060642009-01-31 22:14:21 +00006551 /* Apply mapping */
6552 if (PyLong_Check(x)) {
6553 long value = PyLong_AS_LONG(x);
6554 if (value < 0 || value > 65535) {
6555 PyErr_SetString(PyExc_TypeError,
6556 "character mapping must be in range(65536)");
6557 Py_DECREF(x);
6558 goto onError;
6559 }
6560 *p++ = (Py_UNICODE)value;
6561 }
6562 else if (x == Py_None) {
6563 /* undefined mapping */
6564 outpos = p-PyUnicode_AS_UNICODE(v);
6565 startinpos = s-starts;
6566 endinpos = startinpos+1;
6567 if (unicode_decode_call_errorhandler(
6568 errors, &errorHandler,
6569 "charmap", "character maps to <undefined>",
6570 &starts, &e, &startinpos, &endinpos, &exc, &s,
6571 &v, &outpos, &p)) {
6572 Py_DECREF(x);
6573 goto onError;
6574 }
6575 Py_DECREF(x);
6576 continue;
6577 }
6578 else if (PyUnicode_Check(x)) {
6579 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006580
Benjamin Peterson29060642009-01-31 22:14:21 +00006581 if (targetsize == 1)
6582 /* 1-1 mapping */
6583 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006584
Benjamin Peterson29060642009-01-31 22:14:21 +00006585 else if (targetsize > 1) {
6586 /* 1-n mapping */
6587 if (targetsize > extrachars) {
6588 /* resize first */
6589 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
6590 Py_ssize_t needed = (targetsize - extrachars) + \
6591 (targetsize << 2);
6592 extrachars += needed;
6593 /* XXX overflow detection missing */
6594 if (_PyUnicode_Resize(&v,
6595 PyUnicode_GET_SIZE(v) + needed) < 0) {
6596 Py_DECREF(x);
6597 goto onError;
6598 }
6599 p = PyUnicode_AS_UNICODE(v) + oldpos;
6600 }
6601 Py_UNICODE_COPY(p,
6602 PyUnicode_AS_UNICODE(x),
6603 targetsize);
6604 p += targetsize;
6605 extrachars -= targetsize;
6606 }
6607 /* 1-0 mapping: skip the character */
6608 }
6609 else {
6610 /* wrong return value */
6611 PyErr_SetString(PyExc_TypeError,
6612 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006613 Py_DECREF(x);
6614 goto onError;
6615 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006616 Py_DECREF(x);
6617 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006618 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619 }
6620 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006621 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6622 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006623 Py_XDECREF(errorHandler);
6624 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006625 if (PyUnicode_READY(v) == -1) {
6626 Py_DECREF(v);
6627 return NULL;
6628 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006629 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006630
Benjamin Peterson29060642009-01-31 22:14:21 +00006631 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006632 Py_XDECREF(errorHandler);
6633 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006634 Py_XDECREF(v);
6635 return NULL;
6636}
6637
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006638/* Charmap encoding: the lookup table */
6639
Alexander Belopolsky40018472011-02-26 01:02:56 +00006640struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00006641 PyObject_HEAD
6642 unsigned char level1[32];
6643 int count2, count3;
6644 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006645};
6646
6647static PyObject*
6648encoding_map_size(PyObject *obj, PyObject* args)
6649{
6650 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006651 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00006652 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006653}
6654
6655static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006656 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00006657 PyDoc_STR("Return the size (in bytes) of this object") },
6658 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006659};
6660
6661static void
6662encoding_map_dealloc(PyObject* o)
6663{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006664 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006665}
6666
6667static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006668 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006669 "EncodingMap", /*tp_name*/
6670 sizeof(struct encoding_map), /*tp_basicsize*/
6671 0, /*tp_itemsize*/
6672 /* methods */
6673 encoding_map_dealloc, /*tp_dealloc*/
6674 0, /*tp_print*/
6675 0, /*tp_getattr*/
6676 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00006677 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00006678 0, /*tp_repr*/
6679 0, /*tp_as_number*/
6680 0, /*tp_as_sequence*/
6681 0, /*tp_as_mapping*/
6682 0, /*tp_hash*/
6683 0, /*tp_call*/
6684 0, /*tp_str*/
6685 0, /*tp_getattro*/
6686 0, /*tp_setattro*/
6687 0, /*tp_as_buffer*/
6688 Py_TPFLAGS_DEFAULT, /*tp_flags*/
6689 0, /*tp_doc*/
6690 0, /*tp_traverse*/
6691 0, /*tp_clear*/
6692 0, /*tp_richcompare*/
6693 0, /*tp_weaklistoffset*/
6694 0, /*tp_iter*/
6695 0, /*tp_iternext*/
6696 encoding_map_methods, /*tp_methods*/
6697 0, /*tp_members*/
6698 0, /*tp_getset*/
6699 0, /*tp_base*/
6700 0, /*tp_dict*/
6701 0, /*tp_descr_get*/
6702 0, /*tp_descr_set*/
6703 0, /*tp_dictoffset*/
6704 0, /*tp_init*/
6705 0, /*tp_alloc*/
6706 0, /*tp_new*/
6707 0, /*tp_free*/
6708 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006709};
6710
6711PyObject*
6712PyUnicode_BuildEncodingMap(PyObject* string)
6713{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006714 PyObject *result;
6715 struct encoding_map *mresult;
6716 int i;
6717 int need_dict = 0;
6718 unsigned char level1[32];
6719 unsigned char level2[512];
6720 unsigned char *mlevel1, *mlevel2, *mlevel3;
6721 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006722 int kind;
6723 void *data;
6724 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006725
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006726 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006727 PyErr_BadArgument();
6728 return NULL;
6729 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006730 kind = PyUnicode_KIND(string);
6731 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006732 memset(level1, 0xFF, sizeof level1);
6733 memset(level2, 0xFF, sizeof level2);
6734
6735 /* If there isn't a one-to-one mapping of NULL to \0,
6736 or if there are non-BMP characters, we need to use
6737 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006738 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006739 need_dict = 1;
6740 for (i = 1; i < 256; i++) {
6741 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006742 ch = PyUnicode_READ(kind, data, i);
6743 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006744 need_dict = 1;
6745 break;
6746 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006747 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006748 /* unmapped character */
6749 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006750 l1 = ch >> 11;
6751 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006752 if (level1[l1] == 0xFF)
6753 level1[l1] = count2++;
6754 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00006755 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006756 }
6757
6758 if (count2 >= 0xFF || count3 >= 0xFF)
6759 need_dict = 1;
6760
6761 if (need_dict) {
6762 PyObject *result = PyDict_New();
6763 PyObject *key, *value;
6764 if (!result)
6765 return NULL;
6766 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006767 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00006768 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006769 if (!key || !value)
6770 goto failed1;
6771 if (PyDict_SetItem(result, key, value) == -1)
6772 goto failed1;
6773 Py_DECREF(key);
6774 Py_DECREF(value);
6775 }
6776 return result;
6777 failed1:
6778 Py_XDECREF(key);
6779 Py_XDECREF(value);
6780 Py_DECREF(result);
6781 return NULL;
6782 }
6783
6784 /* Create a three-level trie */
6785 result = PyObject_MALLOC(sizeof(struct encoding_map) +
6786 16*count2 + 128*count3 - 1);
6787 if (!result)
6788 return PyErr_NoMemory();
6789 PyObject_Init(result, &EncodingMapType);
6790 mresult = (struct encoding_map*)result;
6791 mresult->count2 = count2;
6792 mresult->count3 = count3;
6793 mlevel1 = mresult->level1;
6794 mlevel2 = mresult->level23;
6795 mlevel3 = mresult->level23 + 16*count2;
6796 memcpy(mlevel1, level1, 32);
6797 memset(mlevel2, 0xFF, 16*count2);
6798 memset(mlevel3, 0, 128*count3);
6799 count3 = 0;
6800 for (i = 1; i < 256; i++) {
6801 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006802 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006803 /* unmapped character */
6804 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006805 o1 = PyUnicode_READ(kind, data, i)>>11;
6806 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006807 i2 = 16*mlevel1[o1] + o2;
6808 if (mlevel2[i2] == 0xFF)
6809 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006810 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006811 i3 = 128*mlevel2[i2] + o3;
6812 mlevel3[i3] = i;
6813 }
6814 return result;
6815}
6816
6817static int
6818encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
6819{
6820 struct encoding_map *map = (struct encoding_map*)mapping;
6821 int l1 = c>>11;
6822 int l2 = (c>>7) & 0xF;
6823 int l3 = c & 0x7F;
6824 int i;
6825
6826#ifdef Py_UNICODE_WIDE
6827 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006828 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006829 }
6830#endif
6831 if (c == 0)
6832 return 0;
6833 /* level 1*/
6834 i = map->level1[l1];
6835 if (i == 0xFF) {
6836 return -1;
6837 }
6838 /* level 2*/
6839 i = map->level23[16*i+l2];
6840 if (i == 0xFF) {
6841 return -1;
6842 }
6843 /* level 3 */
6844 i = map->level23[16*map->count2 + 128*i + l3];
6845 if (i == 0) {
6846 return -1;
6847 }
6848 return i;
6849}
6850
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006851/* Lookup the character ch in the mapping. If the character
6852 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00006853 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006854static PyObject *
6855charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006856{
Christian Heimes217cfd12007-12-02 14:31:20 +00006857 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006858 PyObject *x;
6859
6860 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006861 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006862 x = PyObject_GetItem(mapping, w);
6863 Py_DECREF(w);
6864 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006865 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6866 /* No mapping found means: mapping is undefined. */
6867 PyErr_Clear();
6868 x = Py_None;
6869 Py_INCREF(x);
6870 return x;
6871 } else
6872 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006873 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00006874 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006875 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00006876 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006877 long value = PyLong_AS_LONG(x);
6878 if (value < 0 || value > 255) {
6879 PyErr_SetString(PyExc_TypeError,
6880 "character mapping must be in range(256)");
6881 Py_DECREF(x);
6882 return NULL;
6883 }
6884 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006886 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00006887 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006888 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006889 /* wrong return value */
6890 PyErr_Format(PyExc_TypeError,
6891 "character mapping must return integer, bytes or None, not %.400s",
6892 x->ob_type->tp_name);
6893 Py_DECREF(x);
6894 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006895 }
6896}
6897
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006898static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00006899charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006900{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006901 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
6902 /* exponentially overallocate to minimize reallocations */
6903 if (requiredsize < 2*outsize)
6904 requiredsize = 2*outsize;
6905 if (_PyBytes_Resize(outobj, requiredsize))
6906 return -1;
6907 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006908}
6909
/* Outcome of trying to encode one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred; an exception is set */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006913/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00006914 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006915 space is available. Return a new reference to the object that
6916 was put in the output buffer, or Py_None, if the mapping was undefined
6917 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00006918 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006919static charmapencode_result
6920charmapencode_output(Py_UNICODE c, PyObject *mapping,
6921 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006922{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006923 PyObject *rep;
6924 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00006925 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006926
Christian Heimes90aa7642007-12-19 02:45:37 +00006927 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006928 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00006929 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006930 if (res == -1)
6931 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00006932 if (outsize<requiredsize)
6933 if (charmapencode_resize(outobj, outpos, requiredsize))
6934 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00006935 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006936 outstart[(*outpos)++] = (char)res;
6937 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006938 }
6939
6940 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006941 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006942 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006943 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006944 Py_DECREF(rep);
6945 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006946 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006947 if (PyLong_Check(rep)) {
6948 Py_ssize_t requiredsize = *outpos+1;
6949 if (outsize<requiredsize)
6950 if (charmapencode_resize(outobj, outpos, requiredsize)) {
6951 Py_DECREF(rep);
6952 return enc_EXCEPTION;
6953 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006954 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006955 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006956 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006957 else {
6958 const char *repchars = PyBytes_AS_STRING(rep);
6959 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
6960 Py_ssize_t requiredsize = *outpos+repsize;
6961 if (outsize<requiredsize)
6962 if (charmapencode_resize(outobj, outpos, requiredsize)) {
6963 Py_DECREF(rep);
6964 return enc_EXCEPTION;
6965 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006966 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006967 memcpy(outstart + *outpos, repchars, repsize);
6968 *outpos += repsize;
6969 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006970 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006971 Py_DECREF(rep);
6972 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006973}
6974
6975/* handle an error in PyUnicode_EncodeCharmap
6976 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006977static int
6978charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00006979 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006980 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00006981 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00006982 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006983{
6984 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006985 Py_ssize_t repsize;
6986 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006987 Py_UNICODE *uni2;
6988 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006989 Py_ssize_t collstartpos = *inpos;
6990 Py_ssize_t collendpos = *inpos+1;
6991 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006992 char *encoding = "charmap";
6993 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006994 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006995
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006996 /* find all unencodable characters */
6997 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006998 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00006999 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007000 int res = encoding_map_lookup(p[collendpos], mapping);
7001 if (res != -1)
7002 break;
7003 ++collendpos;
7004 continue;
7005 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007006
Benjamin Peterson29060642009-01-31 22:14:21 +00007007 rep = charmapencode_lookup(p[collendpos], mapping);
7008 if (rep==NULL)
7009 return -1;
7010 else if (rep!=Py_None) {
7011 Py_DECREF(rep);
7012 break;
7013 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007014 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007015 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007016 }
7017 /* cache callback name lookup
7018 * (if not done yet, i.e. it's the first error) */
7019 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007020 if ((errors==NULL) || (!strcmp(errors, "strict")))
7021 *known_errorHandler = 1;
7022 else if (!strcmp(errors, "replace"))
7023 *known_errorHandler = 2;
7024 else if (!strcmp(errors, "ignore"))
7025 *known_errorHandler = 3;
7026 else if (!strcmp(errors, "xmlcharrefreplace"))
7027 *known_errorHandler = 4;
7028 else
7029 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007030 }
7031 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007032 case 1: /* strict */
7033 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7034 return -1;
7035 case 2: /* replace */
7036 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007037 x = charmapencode_output('?', mapping, res, respos);
7038 if (x==enc_EXCEPTION) {
7039 return -1;
7040 }
7041 else if (x==enc_FAILED) {
7042 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7043 return -1;
7044 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007045 }
7046 /* fall through */
7047 case 3: /* ignore */
7048 *inpos = collendpos;
7049 break;
7050 case 4: /* xmlcharrefreplace */
7051 /* generate replacement (temporarily (mis)uses p) */
7052 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007053 char buffer[2+29+1+1];
7054 char *cp;
7055 sprintf(buffer, "&#%d;", (int)p[collpos]);
7056 for (cp = buffer; *cp; ++cp) {
7057 x = charmapencode_output(*cp, mapping, res, respos);
7058 if (x==enc_EXCEPTION)
7059 return -1;
7060 else if (x==enc_FAILED) {
7061 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7062 return -1;
7063 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007064 }
7065 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007066 *inpos = collendpos;
7067 break;
7068 default:
7069 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007070 encoding, reason, p, size, exceptionObject,
7071 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007072 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007073 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007074 if (PyBytes_Check(repunicode)) {
7075 /* Directly copy bytes result to output. */
7076 Py_ssize_t outsize = PyBytes_Size(*res);
7077 Py_ssize_t requiredsize;
7078 repsize = PyBytes_Size(repunicode);
7079 requiredsize = *respos + repsize;
7080 if (requiredsize > outsize)
7081 /* Make room for all additional bytes. */
7082 if (charmapencode_resize(res, respos, requiredsize)) {
7083 Py_DECREF(repunicode);
7084 return -1;
7085 }
7086 memcpy(PyBytes_AsString(*res) + *respos,
7087 PyBytes_AsString(repunicode), repsize);
7088 *respos += repsize;
7089 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007090 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007091 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007092 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007093 /* generate replacement */
7094 repsize = PyUnicode_GET_SIZE(repunicode);
7095 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007096 x = charmapencode_output(*uni2, mapping, res, respos);
7097 if (x==enc_EXCEPTION) {
7098 return -1;
7099 }
7100 else if (x==enc_FAILED) {
7101 Py_DECREF(repunicode);
7102 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7103 return -1;
7104 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007105 }
7106 *inpos = newpos;
7107 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007108 }
7109 return 0;
7110}
7111
Alexander Belopolsky40018472011-02-26 01:02:56 +00007112PyObject *
7113PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7114 Py_ssize_t size,
7115 PyObject *mapping,
7116 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007117{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007118 /* output object */
7119 PyObject *res = NULL;
7120 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007121 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007122 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007123 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007124 PyObject *errorHandler = NULL;
7125 PyObject *exc = NULL;
7126 /* the following variable is used for caching string comparisons
7127 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7128 * 3=ignore, 4=xmlcharrefreplace */
7129 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007130
7131 /* Default to Latin-1 */
7132 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007133 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007134
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007135 /* allocate enough for a simple encoding without
7136 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007137 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007138 if (res == NULL)
7139 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007140 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007141 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007142
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007143 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007144 /* try to encode it */
7145 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7146 if (x==enc_EXCEPTION) /* error */
7147 goto onError;
7148 if (x==enc_FAILED) { /* unencodable character */
7149 if (charmap_encoding_error(p, size, &inpos, mapping,
7150 &exc,
7151 &known_errorHandler, &errorHandler, errors,
7152 &res, &respos)) {
7153 goto onError;
7154 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007155 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007156 else
7157 /* done with this character => adjust input position */
7158 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007159 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007160
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007161 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007162 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007163 if (_PyBytes_Resize(&res, respos) < 0)
7164 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007165
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007166 Py_XDECREF(exc);
7167 Py_XDECREF(errorHandler);
7168 return res;
7169
Benjamin Peterson29060642009-01-31 22:14:21 +00007170 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007171 Py_XDECREF(res);
7172 Py_XDECREF(exc);
7173 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007174 return NULL;
7175}
7176
Alexander Belopolsky40018472011-02-26 01:02:56 +00007177PyObject *
7178PyUnicode_AsCharmapString(PyObject *unicode,
7179 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007180{
7181 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007182 PyErr_BadArgument();
7183 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007184 }
7185 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007186 PyUnicode_GET_SIZE(unicode),
7187 mapping,
7188 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007189}
7190
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007191/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007192static void
7193make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007194 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007195 Py_ssize_t startpos, Py_ssize_t endpos,
7196 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007197{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007198 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007199 *exceptionObject = _PyUnicodeTranslateError_Create(
7200 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007201 }
7202 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007203 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7204 goto onError;
7205 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7206 goto onError;
7207 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7208 goto onError;
7209 return;
7210 onError:
7211 Py_DECREF(*exceptionObject);
7212 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007213 }
7214}
7215
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007216/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007217static void
7218raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007219 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007220 Py_ssize_t startpos, Py_ssize_t endpos,
7221 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007222{
7223 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007224 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007225 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007226 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007227}
7228
7229/* error handling callback helper:
7230 build arguments, call the callback and check the arguments,
7231 put the result into newpos and return the replacement string, which
7232 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007233static PyObject *
7234unicode_translate_call_errorhandler(const char *errors,
7235 PyObject **errorHandler,
7236 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007237 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007238 Py_ssize_t startpos, Py_ssize_t endpos,
7239 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007240{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007241 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007242
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007243 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007244 PyObject *restuple;
7245 PyObject *resunicode;
7246
7247 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007248 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007249 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007250 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007251 }
7252
7253 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007254 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007255 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007256 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007257
7258 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007259 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007260 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007261 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007262 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007263 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007264 Py_DECREF(restuple);
7265 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007266 }
7267 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007268 &resunicode, &i_newpos)) {
7269 Py_DECREF(restuple);
7270 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007271 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007272 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007273 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007274 else
7275 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007276 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007277 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7278 Py_DECREF(restuple);
7279 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007280 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007281 Py_INCREF(resunicode);
7282 Py_DECREF(restuple);
7283 return resunicode;
7284}
7285
7286/* Lookup the character ch in the mapping and put the result in result,
7287 which must be decrefed by the caller.
7288 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007289static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007290charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007291{
Christian Heimes217cfd12007-12-02 14:31:20 +00007292 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007293 PyObject *x;
7294
7295 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007296 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007297 x = PyObject_GetItem(mapping, w);
7298 Py_DECREF(w);
7299 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007300 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7301 /* No mapping found means: use 1:1 mapping. */
7302 PyErr_Clear();
7303 *result = NULL;
7304 return 0;
7305 } else
7306 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007307 }
7308 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007309 *result = x;
7310 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007311 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007312 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007313 long value = PyLong_AS_LONG(x);
7314 long max = PyUnicode_GetMax();
7315 if (value < 0 || value > max) {
7316 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007317 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007318 Py_DECREF(x);
7319 return -1;
7320 }
7321 *result = x;
7322 return 0;
7323 }
7324 else if (PyUnicode_Check(x)) {
7325 *result = x;
7326 return 0;
7327 }
7328 else {
7329 /* wrong return value */
7330 PyErr_SetString(PyExc_TypeError,
7331 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007332 Py_DECREF(x);
7333 return -1;
7334 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007335}
7336/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007337 if not reallocate and adjust various state variables.
7338 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007339static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007340charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007341 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007342{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007343 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007344 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007345 /* exponentially overallocate to minimize reallocations */
7346 if (requiredsize < 2 * oldsize)
7347 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007348 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7349 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007350 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007351 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007352 }
7353 return 0;
7354}
7355/* lookup the character, put the result in the output string and adjust
7356 various state variables. Return a new reference to the object that
7357 was put in the output buffer in *result, or Py_None, if the mapping was
7358 undefined (in which case no character was written).
   The caller must decref result.
7360 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007361static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007362charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7363 PyObject *mapping, Py_UCS4 **output,
7364 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007365 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007366{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007367 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7368 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007369 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007370 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007371 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007372 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007373 }
7374 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007375 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007376 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007377 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007378 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007379 }
7380 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007381 Py_ssize_t repsize;
7382 if (PyUnicode_READY(*res) == -1)
7383 return -1;
7384 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007385 if (repsize==1) {
7386 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007387 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007388 }
7389 else if (repsize!=0) {
7390 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007391 Py_ssize_t requiredsize = *opos +
7392 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007393 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007394 Py_ssize_t i;
7395 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007396 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007397 for(i = 0; i < repsize; i++)
7398 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007399 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007400 }
7401 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007402 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007403 return 0;
7404}
7405
Alexander Belopolsky40018472011-02-26 01:02:56 +00007406PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007407_PyUnicode_TranslateCharmap(PyObject *input,
7408 PyObject *mapping,
7409 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007410{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007411 /* input object */
7412 char *idata;
7413 Py_ssize_t size, i;
7414 int kind;
7415 /* output buffer */
7416 Py_UCS4 *output = NULL;
7417 Py_ssize_t osize;
7418 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007419 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007420 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007421 char *reason = "character maps to <undefined>";
7422 PyObject *errorHandler = NULL;
7423 PyObject *exc = NULL;
7424 /* the following variable is used for caching string comparisons
7425 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7426 * 3=ignore, 4=xmlcharrefreplace */
7427 int known_errorHandler = -1;
7428
Guido van Rossumd57fd912000-03-10 22:53:23 +00007429 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007430 PyErr_BadArgument();
7431 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007432 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007433
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007434 if (PyUnicode_READY(input) == -1)
7435 return NULL;
7436 idata = (char*)PyUnicode_DATA(input);
7437 kind = PyUnicode_KIND(input);
7438 size = PyUnicode_GET_LENGTH(input);
7439 i = 0;
7440
7441 if (size == 0) {
7442 Py_INCREF(input);
7443 return input;
7444 }
7445
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007446 /* allocate enough for a simple 1:1 translation without
7447 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007448 osize = size;
7449 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7450 opos = 0;
7451 if (output == NULL) {
7452 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007453 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007454 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007455
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007456 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007457 /* try to encode it */
7458 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007459 if (charmaptranslate_output(input, i, mapping,
7460 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007461 Py_XDECREF(x);
7462 goto onError;
7463 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007464 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007465 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007466 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007467 else { /* untranslatable character */
7468 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7469 Py_ssize_t repsize;
7470 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007471 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007472 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007473 Py_ssize_t collstart = i;
7474 Py_ssize_t collend = i+1;
7475 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007476
Benjamin Peterson29060642009-01-31 22:14:21 +00007477 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007478 while (collend < size) {
7479 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007480 goto onError;
7481 Py_XDECREF(x);
7482 if (x!=Py_None)
7483 break;
7484 ++collend;
7485 }
7486 /* cache callback name lookup
7487 * (if not done yet, i.e. it's the first error) */
7488 if (known_errorHandler==-1) {
7489 if ((errors==NULL) || (!strcmp(errors, "strict")))
7490 known_errorHandler = 1;
7491 else if (!strcmp(errors, "replace"))
7492 known_errorHandler = 2;
7493 else if (!strcmp(errors, "ignore"))
7494 known_errorHandler = 3;
7495 else if (!strcmp(errors, "xmlcharrefreplace"))
7496 known_errorHandler = 4;
7497 else
7498 known_errorHandler = 0;
7499 }
7500 switch (known_errorHandler) {
7501 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007502 raise_translate_exception(&exc, input, collstart,
7503 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007504 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007505 case 2: /* replace */
7506 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007507 for (coll = collstart; coll<collend; coll++)
7508 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007509 /* fall through */
7510 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007511 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007512 break;
7513 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007514 /* generate replacement (temporarily (mis)uses i) */
7515 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007516 char buffer[2+29+1+1];
7517 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007518 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7519 if (charmaptranslate_makespace(&output, &osize,
7520 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007521 goto onError;
7522 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007523 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007524 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007525 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007526 break;
7527 default:
7528 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007529 reason, input, &exc,
7530 collstart, collend, &newpos);
7531 if (repunicode == NULL || PyUnicode_READY(repunicode) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007532 goto onError;
7533 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007534 repsize = PyUnicode_GET_LENGTH(repunicode);
7535 if (charmaptranslate_makespace(&output, &osize,
7536 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007537 Py_DECREF(repunicode);
7538 goto onError;
7539 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007540 for (uni2 = 0; repsize-->0; ++uni2)
7541 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7542 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007543 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007544 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007545 }
7546 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007547 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7548 if (!res)
7549 goto onError;
7550 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007551 Py_XDECREF(exc);
7552 Py_XDECREF(errorHandler);
7553 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007554
Benjamin Peterson29060642009-01-31 22:14:21 +00007555 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007556 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007557 Py_XDECREF(exc);
7558 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007559 return NULL;
7560}
7561
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007562/* Deprecated. Use PyUnicode_Translate instead. */
7563PyObject *
7564PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7565 Py_ssize_t size,
7566 PyObject *mapping,
7567 const char *errors)
7568{
7569 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7570 if (!unicode)
7571 return NULL;
7572 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7573}
7574
Alexander Belopolsky40018472011-02-26 01:02:56 +00007575PyObject *
7576PyUnicode_Translate(PyObject *str,
7577 PyObject *mapping,
7578 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007579{
7580 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007581
Guido van Rossumd57fd912000-03-10 22:53:23 +00007582 str = PyUnicode_FromObject(str);
7583 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007584 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007585 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007586 Py_DECREF(str);
7587 return result;
Tim Petersced69f82003-09-16 20:30:58 +00007588
Benjamin Peterson29060642009-01-31 22:14:21 +00007589 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007590 Py_XDECREF(str);
7591 return NULL;
7592}
Tim Petersced69f82003-09-16 20:30:58 +00007593
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007594static Py_UCS4
7595fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
7596{
7597 /* No need to call PyUnicode_READY(self) because this function is only
7598 called as a callback from fixup() which does it already. */
7599 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
7600 const int kind = PyUnicode_KIND(self);
7601 void *data = PyUnicode_DATA(self);
7602 Py_UCS4 maxchar = 0, ch, fixed;
7603 Py_ssize_t i;
7604
7605 for (i = 0; i < len; ++i) {
7606 ch = PyUnicode_READ(kind, data, i);
7607 fixed = 0;
7608 if (ch > 127) {
7609 if (Py_UNICODE_ISSPACE(ch))
7610 fixed = ' ';
7611 else {
7612 const int decimal = Py_UNICODE_TODECIMAL(ch);
7613 if (decimal >= 0)
7614 fixed = '0' + decimal;
7615 }
7616 if (fixed != 0) {
7617 if (fixed > maxchar)
7618 maxchar = fixed;
7619 PyUnicode_WRITE(kind, data, i, fixed);
7620 }
7621 else if (ch > maxchar)
7622 maxchar = ch;
7623 }
7624 else if (ch > maxchar)
7625 maxchar = ch;
7626 }
7627
7628 return maxchar;
7629}
7630
7631PyObject *
7632_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
7633{
7634 if (!PyUnicode_Check(unicode)) {
7635 PyErr_BadInternalCall();
7636 return NULL;
7637 }
7638 if (PyUnicode_READY(unicode) == -1)
7639 return NULL;
7640 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
7641 /* If the string is already ASCII, just return the same string */
7642 Py_INCREF(unicode);
7643 return unicode;
7644 }
7645 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
7646}
7647
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007648PyObject *
7649PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
7650 Py_ssize_t length)
7651{
7652 PyObject *result;
7653 Py_UNICODE *p; /* write pointer into result */
7654 Py_ssize_t i;
7655 /* Copy to a new string */
7656 result = (PyObject *)_PyUnicode_New(length);
7657 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
7658 if (result == NULL)
7659 return result;
7660 p = PyUnicode_AS_UNICODE(result);
7661 /* Iterate over code points */
7662 for (i = 0; i < length; i++) {
7663 Py_UNICODE ch =s[i];
7664 if (ch > 127) {
7665 int decimal = Py_UNICODE_TODECIMAL(ch);
7666 if (decimal >= 0)
7667 p[i] = '0' + decimal;
7668 }
7669 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007670 if (PyUnicode_READY((PyUnicodeObject*)result) == -1) {
7671 Py_DECREF(result);
7672 return NULL;
7673 }
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007674 return result;
7675}
Guido van Rossum9e896b32000-04-05 20:11:21 +00007676/* --- Decimal Encoder ---------------------------------------------------- */
7677
Alexander Belopolsky40018472011-02-26 01:02:56 +00007678int
7679PyUnicode_EncodeDecimal(Py_UNICODE *s,
7680 Py_ssize_t length,
7681 char *output,
7682 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00007683{
7684 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007685 PyObject *errorHandler = NULL;
7686 PyObject *exc = NULL;
7687 const char *encoding = "decimal";
7688 const char *reason = "invalid decimal Unicode string";
7689 /* the following variable is used for caching string comparisons
7690 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
7691 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007692
7693 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007694 PyErr_BadArgument();
7695 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007696 }
7697
7698 p = s;
7699 end = s + length;
7700 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007701 register Py_UNICODE ch = *p;
7702 int decimal;
7703 PyObject *repunicode;
7704 Py_ssize_t repsize;
7705 Py_ssize_t newpos;
7706 Py_UNICODE *uni2;
7707 Py_UNICODE *collstart;
7708 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00007709
Benjamin Peterson29060642009-01-31 22:14:21 +00007710 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007711 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00007712 ++p;
7713 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007714 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007715 decimal = Py_UNICODE_TODECIMAL(ch);
7716 if (decimal >= 0) {
7717 *output++ = '0' + decimal;
7718 ++p;
7719 continue;
7720 }
7721 if (0 < ch && ch < 256) {
7722 *output++ = (char)ch;
7723 ++p;
7724 continue;
7725 }
7726 /* All other characters are considered unencodable */
7727 collstart = p;
7728 collend = p+1;
7729 while (collend < end) {
7730 if ((0 < *collend && *collend < 256) ||
7731 !Py_UNICODE_ISSPACE(*collend) ||
7732 Py_UNICODE_TODECIMAL(*collend))
7733 break;
7734 }
7735 /* cache callback name lookup
7736 * (if not done yet, i.e. it's the first error) */
7737 if (known_errorHandler==-1) {
7738 if ((errors==NULL) || (!strcmp(errors, "strict")))
7739 known_errorHandler = 1;
7740 else if (!strcmp(errors, "replace"))
7741 known_errorHandler = 2;
7742 else if (!strcmp(errors, "ignore"))
7743 known_errorHandler = 3;
7744 else if (!strcmp(errors, "xmlcharrefreplace"))
7745 known_errorHandler = 4;
7746 else
7747 known_errorHandler = 0;
7748 }
7749 switch (known_errorHandler) {
7750 case 1: /* strict */
7751 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
7752 goto onError;
7753 case 2: /* replace */
7754 for (p = collstart; p < collend; ++p)
7755 *output++ = '?';
7756 /* fall through */
7757 case 3: /* ignore */
7758 p = collend;
7759 break;
7760 case 4: /* xmlcharrefreplace */
7761 /* generate replacement (temporarily (mis)uses p) */
7762 for (p = collstart; p < collend; ++p)
7763 output += sprintf(output, "&#%d;", (int)*p);
7764 p = collend;
7765 break;
7766 default:
7767 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
7768 encoding, reason, s, length, &exc,
7769 collstart-s, collend-s, &newpos);
7770 if (repunicode == NULL)
7771 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007772 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007773 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007774 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
7775 Py_DECREF(repunicode);
7776 goto onError;
7777 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007778 /* generate replacement */
7779 repsize = PyUnicode_GET_SIZE(repunicode);
7780 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
7781 Py_UNICODE ch = *uni2;
7782 if (Py_UNICODE_ISSPACE(ch))
7783 *output++ = ' ';
7784 else {
7785 decimal = Py_UNICODE_TODECIMAL(ch);
7786 if (decimal >= 0)
7787 *output++ = '0' + decimal;
7788 else if (0 < ch && ch < 256)
7789 *output++ = (char)ch;
7790 else {
7791 Py_DECREF(repunicode);
7792 raise_encode_exception(&exc, encoding,
7793 s, length, collstart-s, collend-s, reason);
7794 goto onError;
7795 }
7796 }
7797 }
7798 p = s + newpos;
7799 Py_DECREF(repunicode);
7800 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00007801 }
7802 /* 0-terminate the output string */
7803 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007804 Py_XDECREF(exc);
7805 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007806 return 0;
7807
Benjamin Peterson29060642009-01-31 22:14:21 +00007808 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007809 Py_XDECREF(exc);
7810 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007811 return -1;
7812}
7813
Guido van Rossumd57fd912000-03-10 22:53:23 +00007814/* --- Helpers ------------------------------------------------------------ */
7815
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007816#include "stringlib/ucs1lib.h"
7817#include "stringlib/fastsearch.h"
7818#include "stringlib/partition.h"
7819#include "stringlib/split.h"
7820#include "stringlib/count.h"
7821#include "stringlib/find.h"
7822#include "stringlib/localeutil.h"
7823#include "stringlib/undef.h"
7824
7825#include "stringlib/ucs2lib.h"
7826#include "stringlib/fastsearch.h"
7827#include "stringlib/partition.h"
7828#include "stringlib/split.h"
7829#include "stringlib/count.h"
7830#include "stringlib/find.h"
7831#include "stringlib/localeutil.h"
7832#include "stringlib/undef.h"
7833
7834#include "stringlib/ucs4lib.h"
7835#include "stringlib/fastsearch.h"
7836#include "stringlib/partition.h"
7837#include "stringlib/split.h"
7838#include "stringlib/count.h"
7839#include "stringlib/find.h"
7840#include "stringlib/localeutil.h"
7841#include "stringlib/undef.h"
7842
7843static Py_ssize_t
7844any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
7845 const Py_UCS1*, Py_ssize_t,
7846 Py_ssize_t, Py_ssize_t),
7847 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
7848 const Py_UCS2*, Py_ssize_t,
7849 Py_ssize_t, Py_ssize_t),
7850 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
7851 const Py_UCS4*, Py_ssize_t,
7852 Py_ssize_t, Py_ssize_t),
7853 PyObject* s1, PyObject* s2,
7854 Py_ssize_t start,
7855 Py_ssize_t end)
7856{
7857 int kind1, kind2, kind;
7858 void *buf1, *buf2;
7859 Py_ssize_t len1, len2, result;
7860
7861 kind1 = PyUnicode_KIND(s1);
7862 kind2 = PyUnicode_KIND(s2);
7863 kind = kind1 > kind2 ? kind1 : kind2;
7864 buf1 = PyUnicode_DATA(s1);
7865 buf2 = PyUnicode_DATA(s2);
7866 if (kind1 != kind)
7867 buf1 = _PyUnicode_AsKind(s1, kind);
7868 if (!buf1)
7869 return -2;
7870 if (kind2 != kind)
7871 buf2 = _PyUnicode_AsKind(s2, kind);
7872 if (!buf2) {
7873 if (kind1 != kind) PyMem_Free(buf1);
7874 return -2;
7875 }
7876 len1 = PyUnicode_GET_LENGTH(s1);
7877 len2 = PyUnicode_GET_LENGTH(s2);
7878
7879 switch(kind) {
7880 case PyUnicode_1BYTE_KIND:
7881 result = ucs1(buf1, len1, buf2, len2, start, end);
7882 break;
7883 case PyUnicode_2BYTE_KIND:
7884 result = ucs2(buf1, len1, buf2, len2, start, end);
7885 break;
7886 case PyUnicode_4BYTE_KIND:
7887 result = ucs4(buf1, len1, buf2, len2, start, end);
7888 break;
7889 default:
7890 assert(0); result = -2;
7891 }
7892
7893 if (kind1 != kind)
7894 PyMem_Free(buf1);
7895 if (kind2 != kind)
7896 PyMem_Free(buf2);
7897
7898 return result;
7899}
7900
7901Py_ssize_t
7902_PyUnicode_InsertThousandsGrouping(int kind, void *data,
7903 Py_ssize_t n_buffer,
7904 void *digits, Py_ssize_t n_digits,
7905 Py_ssize_t min_width,
7906 const char *grouping,
7907 const char *thousands_sep)
7908{
7909 switch(kind) {
7910 case PyUnicode_1BYTE_KIND:
7911 return _PyUnicode_ucs1_InsertThousandsGrouping(
7912 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
7913 min_width, grouping, thousands_sep);
7914 case PyUnicode_2BYTE_KIND:
7915 return _PyUnicode_ucs2_InsertThousandsGrouping(
7916 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
7917 min_width, grouping, thousands_sep);
7918 case PyUnicode_4BYTE_KIND:
7919 return _PyUnicode_ucs4_InsertThousandsGrouping(
7920 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
7921 min_width, grouping, thousands_sep);
7922 }
7923 assert(0);
7924 return -1;
7925}
7926
7927
Eric Smith8c663262007-08-25 02:26:07 +00007928#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00007929#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007930
Thomas Wouters477c8d52006-05-27 19:21:47 +00007931#include "stringlib/count.h"
7932#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00007933
/* helper macro to fixup start/end slice values:
   `end` is clipped to `len`; a negative `start` or `end` is taken relative
   to `len` and clipped to 0 if it is still negative afterwards.
   `start` and `end` must be modifiable lvalues.  The body is wrapped in
   do { } while (0) so that the macro behaves as one statement, e.g. as
   the body of an unbraced `if`. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007948
Alexander Belopolsky40018472011-02-26 01:02:56 +00007949Py_ssize_t
7950PyUnicode_Count(PyObject *str,
7951 PyObject *substr,
7952 Py_ssize_t start,
7953 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007954{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007955 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007956 PyUnicodeObject* str_obj;
7957 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007958 int kind1, kind2, kind;
7959 void *buf1 = NULL, *buf2 = NULL;
7960 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00007961
Thomas Wouters477c8d52006-05-27 19:21:47 +00007962 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007963 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007964 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007965 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02007966 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007967 Py_DECREF(str_obj);
7968 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007969 }
Tim Petersced69f82003-09-16 20:30:58 +00007970
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007971 kind1 = PyUnicode_KIND(str_obj);
7972 kind2 = PyUnicode_KIND(sub_obj);
7973 kind = kind1 > kind2 ? kind1 : kind2;
7974 buf1 = PyUnicode_DATA(str_obj);
7975 if (kind1 != kind)
7976 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
7977 if (!buf1)
7978 goto onError;
7979 buf2 = PyUnicode_DATA(sub_obj);
7980 if (kind2 != kind)
7981 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
7982 if (!buf2)
7983 goto onError;
7984 len1 = PyUnicode_GET_LENGTH(str_obj);
7985 len2 = PyUnicode_GET_LENGTH(sub_obj);
7986
7987 ADJUST_INDICES(start, end, len1);
7988 switch(kind) {
7989 case PyUnicode_1BYTE_KIND:
7990 result = ucs1lib_count(
7991 ((Py_UCS1*)buf1) + start, end - start,
7992 buf2, len2, PY_SSIZE_T_MAX
7993 );
7994 break;
7995 case PyUnicode_2BYTE_KIND:
7996 result = ucs2lib_count(
7997 ((Py_UCS2*)buf1) + start, end - start,
7998 buf2, len2, PY_SSIZE_T_MAX
7999 );
8000 break;
8001 case PyUnicode_4BYTE_KIND:
8002 result = ucs4lib_count(
8003 ((Py_UCS4*)buf1) + start, end - start,
8004 buf2, len2, PY_SSIZE_T_MAX
8005 );
8006 break;
8007 default:
8008 assert(0); result = 0;
8009 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008010
8011 Py_DECREF(sub_obj);
8012 Py_DECREF(str_obj);
8013
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008014 if (kind1 != kind)
8015 PyMem_Free(buf1);
8016 if (kind2 != kind)
8017 PyMem_Free(buf2);
8018
Guido van Rossumd57fd912000-03-10 22:53:23 +00008019 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008020 onError:
8021 Py_DECREF(sub_obj);
8022 Py_DECREF(str_obj);
8023 if (kind1 != kind && buf1)
8024 PyMem_Free(buf1);
8025 if (kind2 != kind && buf2)
8026 PyMem_Free(buf2);
8027 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008028}
8029
Alexander Belopolsky40018472011-02-26 01:02:56 +00008030Py_ssize_t
8031PyUnicode_Find(PyObject *str,
8032 PyObject *sub,
8033 Py_ssize_t start,
8034 Py_ssize_t end,
8035 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008036{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008037 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008038
Guido van Rossumd57fd912000-03-10 22:53:23 +00008039 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008040 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008041 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008042 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008043 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008044 Py_DECREF(str);
8045 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008046 }
Tim Petersced69f82003-09-16 20:30:58 +00008047
Thomas Wouters477c8d52006-05-27 19:21:47 +00008048 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008049 result = any_find_slice(
8050 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
8051 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008052 );
8053 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008054 result = any_find_slice(
8055 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8056 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008057 );
8058
Guido van Rossumd57fd912000-03-10 22:53:23 +00008059 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008060 Py_DECREF(sub);
8061
Guido van Rossumd57fd912000-03-10 22:53:23 +00008062 return result;
8063}
8064
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008065Py_ssize_t
8066PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8067 Py_ssize_t start, Py_ssize_t end,
8068 int direction)
8069{
8070 char *result;
8071 int kind;
8072 if (PyUnicode_READY(str) == -1)
8073 return -2;
8074 if (end > PyUnicode_GET_LENGTH(str))
8075 end = PyUnicode_GET_LENGTH(str);
8076 kind = PyUnicode_KIND(str);
8077 result = findchar(PyUnicode_1BYTE_DATA(str)
8078 + PyUnicode_KIND_SIZE(kind, start),
8079 kind,
8080 end-start, ch, direction);
8081 if (!result)
8082 return -1;
8083 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8084}
8085
Alexander Belopolsky40018472011-02-26 01:02:56 +00008086static int
8087tailmatch(PyUnicodeObject *self,
8088 PyUnicodeObject *substring,
8089 Py_ssize_t start,
8090 Py_ssize_t end,
8091 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008092{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008093 int kind_self;
8094 int kind_sub;
8095 void *data_self;
8096 void *data_sub;
8097 Py_ssize_t offset;
8098 Py_ssize_t i;
8099 Py_ssize_t end_sub;
8100
8101 if (PyUnicode_READY(self) == -1 ||
8102 PyUnicode_READY(substring) == -1)
8103 return 0;
8104
8105 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008106 return 1;
8107
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008108 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8109 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008110 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008111 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008112
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008113 kind_self = PyUnicode_KIND(self);
8114 data_self = PyUnicode_DATA(self);
8115 kind_sub = PyUnicode_KIND(substring);
8116 data_sub = PyUnicode_DATA(substring);
8117 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8118
8119 if (direction > 0)
8120 offset = end;
8121 else
8122 offset = start;
8123
8124 if (PyUnicode_READ(kind_self, data_self, offset) ==
8125 PyUnicode_READ(kind_sub, data_sub, 0) &&
8126 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8127 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8128 /* If both are of the same kind, memcmp is sufficient */
8129 if (kind_self == kind_sub) {
8130 return ! memcmp((char *)data_self +
8131 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8132 data_sub,
8133 PyUnicode_GET_LENGTH(substring) *
8134 PyUnicode_CHARACTER_SIZE(substring));
8135 }
8136 /* otherwise we have to compare each character by first accesing it */
8137 else {
8138 /* We do not need to compare 0 and len(substring)-1 because
8139 the if statement above ensured already that they are equal
8140 when we end up here. */
8141 // TODO: honor direction and do a forward or backwards search
8142 for (i = 1; i < end_sub; ++i) {
8143 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8144 PyUnicode_READ(kind_sub, data_sub, i))
8145 return 0;
8146 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008147 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008148 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008149 }
8150
8151 return 0;
8152}
8153
Alexander Belopolsky40018472011-02-26 01:02:56 +00008154Py_ssize_t
8155PyUnicode_Tailmatch(PyObject *str,
8156 PyObject *substr,
8157 Py_ssize_t start,
8158 Py_ssize_t end,
8159 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008160{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008161 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008162
Guido van Rossumd57fd912000-03-10 22:53:23 +00008163 str = PyUnicode_FromObject(str);
8164 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008165 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008166 substr = PyUnicode_FromObject(substr);
8167 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008168 Py_DECREF(str);
8169 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008170 }
Tim Petersced69f82003-09-16 20:30:58 +00008171
Guido van Rossumd57fd912000-03-10 22:53:23 +00008172 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008173 (PyUnicodeObject *)substr,
8174 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008175 Py_DECREF(str);
8176 Py_DECREF(substr);
8177 return result;
8178}
8179
Guido van Rossumd57fd912000-03-10 22:53:23 +00008180/* Apply fixfct filter to the Unicode object self and return a
8181 reference to the modified object */
8182
Alexander Belopolsky40018472011-02-26 01:02:56 +00008183static PyObject *
8184fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008185 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008186{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008187 PyObject *u;
8188 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008189
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008190 if (PyUnicode_READY(self) == -1)
8191 return NULL;
8192 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8193 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8194 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008195 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008196 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008197
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008198 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8199 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008200
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008201 /* fix functions return the new maximum character in a string,
8202 if the kind of the resulting unicode object does not change,
8203 everything is fine. Otherwise we need to change the string kind
8204 and re-run the fix function. */
8205 maxchar_new = fixfct((PyUnicodeObject*)u);
8206 if (maxchar_new == 0)
8207 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8208 else if (maxchar_new <= 127)
8209 maxchar_new = 127;
8210 else if (maxchar_new <= 255)
8211 maxchar_new = 255;
8212 else if (maxchar_new <= 65535)
8213 maxchar_new = 65535;
8214 else
8215 maxchar_new = 1114111; /* 0x10ffff */
8216
8217 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008218 /* fixfct should return TRUE if it modified the buffer. If
8219 FALSE, return a reference to the original buffer instead
8220 (to save space, not time) */
8221 Py_INCREF(self);
8222 Py_DECREF(u);
8223 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008224 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008225 else if (maxchar_new == maxchar_old) {
8226 return u;
8227 }
8228 else {
8229 /* In case the maximum character changed, we need to
8230 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008231 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008232 if (v == NULL) {
8233 Py_DECREF(u);
8234 return NULL;
8235 }
8236 if (maxchar_new > maxchar_old) {
8237 /* If the maxchar increased so that the kind changed, not all
8238 characters are representable anymore and we need to fix the
8239 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008240 if (PyUnicode_CopyCharacters(v, 0,
8241 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008242 PyUnicode_GET_LENGTH(self)) < 0)
8243 {
8244 Py_DECREF(u);
8245 return NULL;
8246 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008247 maxchar_old = fixfct((PyUnicodeObject*)v);
8248 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8249 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008250 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008251 if (PyUnicode_CopyCharacters(v, 0,
8252 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008253 PyUnicode_GET_LENGTH(self)) < 0)
8254 {
8255 Py_DECREF(u);
8256 return NULL;
8257 }
8258 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008259
8260 Py_DECREF(u);
8261 return v;
8262 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008263}
8264
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008265static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008266fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008267{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008268 /* No need to call PyUnicode_READY(self) because this function is only
8269 called as a callback from fixup() which does it already. */
8270 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8271 const int kind = PyUnicode_KIND(self);
8272 void *data = PyUnicode_DATA(self);
8273 int touched = 0;
8274 Py_UCS4 maxchar = 0;
8275 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008276
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008277 for (i = 0; i < len; ++i) {
8278 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8279 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8280 if (up != ch) {
8281 if (up > maxchar)
8282 maxchar = up;
8283 PyUnicode_WRITE(kind, data, i, up);
8284 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008285 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008286 else if (ch > maxchar)
8287 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008288 }
8289
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008290 if (touched)
8291 return maxchar;
8292 else
8293 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008294}
8295
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008296static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008297fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008298{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008299 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8300 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8301 const int kind = PyUnicode_KIND(self);
8302 void *data = PyUnicode_DATA(self);
8303 int touched = 0;
8304 Py_UCS4 maxchar = 0;
8305 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008306
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008307 for(i = 0; i < len; ++i) {
8308 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8309 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8310 if (lo != ch) {
8311 if (lo > maxchar)
8312 maxchar = lo;
8313 PyUnicode_WRITE(kind, data, i, lo);
8314 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008315 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008316 else if (ch > maxchar)
8317 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008318 }
8319
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008320 if (touched)
8321 return maxchar;
8322 else
8323 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008324}
8325
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008326static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008327fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008328{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008329 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8330 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8331 const int kind = PyUnicode_KIND(self);
8332 void *data = PyUnicode_DATA(self);
8333 int touched = 0;
8334 Py_UCS4 maxchar = 0;
8335 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008336
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008337 for(i = 0; i < len; ++i) {
8338 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8339 Py_UCS4 nu = 0;
8340
8341 if (Py_UNICODE_ISUPPER(ch))
8342 nu = Py_UNICODE_TOLOWER(ch);
8343 else if (Py_UNICODE_ISLOWER(ch))
8344 nu = Py_UNICODE_TOUPPER(ch);
8345
8346 if (nu != 0) {
8347 if (nu > maxchar)
8348 maxchar = nu;
8349 PyUnicode_WRITE(kind, data, i, nu);
8350 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008351 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008352 else if (ch > maxchar)
8353 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008354 }
8355
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008356 if (touched)
8357 return maxchar;
8358 else
8359 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008360}
8361
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008362static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008363fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008364{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008365 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8366 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8367 const int kind = PyUnicode_KIND(self);
8368 void *data = PyUnicode_DATA(self);
8369 int touched = 0;
8370 Py_UCS4 maxchar = 0;
8371 Py_ssize_t i = 0;
8372 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008373
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008374 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008375 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008376
8377 ch = PyUnicode_READ(kind, data, i);
8378 if (!Py_UNICODE_ISUPPER(ch)) {
8379 maxchar = Py_UNICODE_TOUPPER(ch);
8380 PyUnicode_WRITE(kind, data, i, maxchar);
8381 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008382 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008383 ++i;
8384 for(; i < len; ++i) {
8385 ch = PyUnicode_READ(kind, data, i);
8386 if (!Py_UNICODE_ISLOWER(ch)) {
8387 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8388 if (lo > maxchar)
8389 maxchar = lo;
8390 PyUnicode_WRITE(kind, data, i, lo);
8391 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008392 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008393 else if (ch > maxchar)
8394 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008395 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008396
8397 if (touched)
8398 return maxchar;
8399 else
8400 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008401}
8402
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008403static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008404fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008405{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008406 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8407 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8408 const int kind = PyUnicode_KIND(self);
8409 void *data = PyUnicode_DATA(self);
8410 Py_UCS4 maxchar = 0;
8411 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008412 int previous_is_cased;
8413
8414 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008415 if (len == 1) {
8416 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8417 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8418 if (ti != ch) {
8419 PyUnicode_WRITE(kind, data, i, ti);
8420 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008421 }
8422 else
8423 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008424 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008425 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008426 for(; i < len; ++i) {
8427 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8428 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008429
Benjamin Peterson29060642009-01-31 22:14:21 +00008430 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008431 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008432 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008433 nu = Py_UNICODE_TOTITLE(ch);
8434
8435 if (nu > maxchar)
8436 maxchar = nu;
8437 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008438
Benjamin Peterson29060642009-01-31 22:14:21 +00008439 if (Py_UNICODE_ISLOWER(ch) ||
8440 Py_UNICODE_ISUPPER(ch) ||
8441 Py_UNICODE_ISTITLE(ch))
8442 previous_is_cased = 1;
8443 else
8444 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008445 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008446 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008447}
8448
Tim Peters8ce9f162004-08-27 01:49:32 +00008449PyObject *
8450PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008451{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008452 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008453 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008454 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008455 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008456 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8457 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008458 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008459 Py_ssize_t sz, i, res_offset;
8460 Py_UCS4 maxchar = 0;
8461 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008462
Tim Peters05eba1f2004-08-27 21:32:02 +00008463 fseq = PySequence_Fast(seq, "");
8464 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008465 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008466 }
8467
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008468 /* NOTE: the following code can't call back into Python code,
8469 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008470 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008471
Tim Peters05eba1f2004-08-27 21:32:02 +00008472 seqlen = PySequence_Fast_GET_SIZE(fseq);
8473 /* If empty sequence, return u"". */
8474 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008475 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008476 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008477 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008478 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008479 /* If singleton sequence with an exact Unicode, return that. */
8480 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008481 item = items[0];
8482 if (PyUnicode_CheckExact(item)) {
8483 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008484 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008485 goto Done;
8486 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008487 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008488 else {
8489 /* Set up sep and seplen */
8490 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008491 /* fall back to a blank space separator */
8492 sep = PyUnicode_FromOrdinal(' ');
Victor Stinnere9a29352011-10-01 02:14:59 +02008493 if (!sep)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008494 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008495 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008496 else {
8497 if (!PyUnicode_Check(separator)) {
8498 PyErr_Format(PyExc_TypeError,
8499 "separator: expected str instance,"
8500 " %.80s found",
8501 Py_TYPE(separator)->tp_name);
8502 goto onError;
8503 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008504 if (PyUnicode_READY(separator) == -1)
8505 goto onError;
8506 sep = separator;
8507 seplen = PyUnicode_GET_LENGTH(separator);
8508 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8509 /* inc refcount to keep this code path symetric with the
8510 above case of a blank separator */
8511 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008512 }
8513 }
8514
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008515 /* There are at least two things to join, or else we have a subclass
8516 * of str in the sequence.
8517 * Do a pre-pass to figure out the total amount of space we'll
8518 * need (sz), and see whether all argument are strings.
8519 */
8520 sz = 0;
8521 for (i = 0; i < seqlen; i++) {
8522 const Py_ssize_t old_sz = sz;
8523 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008524 if (!PyUnicode_Check(item)) {
8525 PyErr_Format(PyExc_TypeError,
8526 "sequence item %zd: expected str instance,"
8527 " %.80s found",
8528 i, Py_TYPE(item)->tp_name);
8529 goto onError;
8530 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008531 if (PyUnicode_READY(item) == -1)
8532 goto onError;
8533 sz += PyUnicode_GET_LENGTH(item);
8534 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8535 if (item_maxchar > maxchar)
8536 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008537 if (i != 0)
8538 sz += seplen;
8539 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8540 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008541 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008542 goto onError;
8543 }
8544 }
Tim Petersced69f82003-09-16 20:30:58 +00008545
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008546 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008547 if (res == NULL)
8548 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00008549
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008550 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008551 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008552 Py_ssize_t itemlen;
8553 item = items[i];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008554 itemlen = PyUnicode_GET_LENGTH(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008555 /* Copy item, and maybe the separator. */
8556 if (i) {
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008557 if (PyUnicode_CopyCharacters(res, res_offset,
8558 sep, 0, seplen) < 0)
8559 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008560 res_offset += seplen;
Benjamin Peterson29060642009-01-31 22:14:21 +00008561 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008562 if (PyUnicode_CopyCharacters(res, res_offset,
8563 item, 0, itemlen) < 0)
8564 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008565 res_offset += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00008566 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008567 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00008568
Benjamin Peterson29060642009-01-31 22:14:21 +00008569 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00008570 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008571 Py_XDECREF(sep);
8572 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008573
Benjamin Peterson29060642009-01-31 22:14:21 +00008574 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00008575 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008576 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00008577 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008578 return NULL;
8579}
8580
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008581#define FILL(kind, data, value, start, length) \
8582 do { \
8583 Py_ssize_t i_ = 0; \
8584 assert(kind != PyUnicode_WCHAR_KIND); \
8585 switch ((kind)) { \
8586 case PyUnicode_1BYTE_KIND: { \
8587 unsigned char * to_ = (unsigned char *)((data)) + (start); \
8588 memset(to_, (unsigned char)value, length); \
8589 break; \
8590 } \
8591 case PyUnicode_2BYTE_KIND: { \
8592 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
8593 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
8594 break; \
8595 } \
8596 default: { \
8597 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
8598 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
8599 break; \
8600 } \
8601 } \
8602 } while (0)
8603
Alexander Belopolsky40018472011-02-26 01:02:56 +00008604static PyUnicodeObject *
8605pad(PyUnicodeObject *self,
8606 Py_ssize_t left,
8607 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008608 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008609{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008610 PyObject *u;
8611 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008612 int kind;
8613 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008614
8615 if (left < 0)
8616 left = 0;
8617 if (right < 0)
8618 right = 0;
8619
Tim Peters7a29bd52001-09-12 03:03:31 +00008620 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008621 Py_INCREF(self);
8622 return self;
8623 }
8624
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008625 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
8626 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00008627 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
8628 return NULL;
8629 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008630 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8631 if (fill > maxchar)
8632 maxchar = fill;
8633 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008634 if (!u)
8635 return NULL;
8636
8637 kind = PyUnicode_KIND(u);
8638 data = PyUnicode_DATA(u);
8639 if (left)
8640 FILL(kind, data, fill, 0, left);
8641 if (right)
8642 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02008643 if (PyUnicode_CopyCharacters(u, left,
8644 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008645 _PyUnicode_LENGTH(self)) < 0)
8646 {
8647 Py_DECREF(u);
8648 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008649 }
8650
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008651 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008652}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008653#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00008654
Alexander Belopolsky40018472011-02-26 01:02:56 +00008655PyObject *
8656PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008657{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008658 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008659
8660 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008661 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008662 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008663
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008664 switch(PyUnicode_KIND(string)) {
8665 case PyUnicode_1BYTE_KIND:
8666 list = ucs1lib_splitlines(
8667 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
8668 PyUnicode_GET_LENGTH(string), keepends);
8669 break;
8670 case PyUnicode_2BYTE_KIND:
8671 list = ucs2lib_splitlines(
8672 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
8673 PyUnicode_GET_LENGTH(string), keepends);
8674 break;
8675 case PyUnicode_4BYTE_KIND:
8676 list = ucs4lib_splitlines(
8677 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
8678 PyUnicode_GET_LENGTH(string), keepends);
8679 break;
8680 default:
8681 assert(0);
8682 list = 0;
8683 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008684 Py_DECREF(string);
8685 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008686}
8687
Alexander Belopolsky40018472011-02-26 01:02:56 +00008688static PyObject *
8689split(PyUnicodeObject *self,
8690 PyUnicodeObject *substring,
8691 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008692{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008693 int kind1, kind2, kind;
8694 void *buf1, *buf2;
8695 Py_ssize_t len1, len2;
8696 PyObject* out;
8697
Guido van Rossumd57fd912000-03-10 22:53:23 +00008698 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008699 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008700
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008701 if (PyUnicode_READY(self) == -1)
8702 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008703
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008704 if (substring == NULL)
8705 switch(PyUnicode_KIND(self)) {
8706 case PyUnicode_1BYTE_KIND:
8707 return ucs1lib_split_whitespace(
8708 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8709 PyUnicode_GET_LENGTH(self), maxcount
8710 );
8711 case PyUnicode_2BYTE_KIND:
8712 return ucs2lib_split_whitespace(
8713 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8714 PyUnicode_GET_LENGTH(self), maxcount
8715 );
8716 case PyUnicode_4BYTE_KIND:
8717 return ucs4lib_split_whitespace(
8718 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8719 PyUnicode_GET_LENGTH(self), maxcount
8720 );
8721 default:
8722 assert(0);
8723 return NULL;
8724 }
8725
8726 if (PyUnicode_READY(substring) == -1)
8727 return NULL;
8728
8729 kind1 = PyUnicode_KIND(self);
8730 kind2 = PyUnicode_KIND(substring);
8731 kind = kind1 > kind2 ? kind1 : kind2;
8732 buf1 = PyUnicode_DATA(self);
8733 buf2 = PyUnicode_DATA(substring);
8734 if (kind1 != kind)
8735 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8736 if (!buf1)
8737 return NULL;
8738 if (kind2 != kind)
8739 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8740 if (!buf2) {
8741 if (kind1 != kind) PyMem_Free(buf1);
8742 return NULL;
8743 }
8744 len1 = PyUnicode_GET_LENGTH(self);
8745 len2 = PyUnicode_GET_LENGTH(substring);
8746
8747 switch(kind) {
8748 case PyUnicode_1BYTE_KIND:
8749 out = ucs1lib_split(
8750 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8751 break;
8752 case PyUnicode_2BYTE_KIND:
8753 out = ucs2lib_split(
8754 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8755 break;
8756 case PyUnicode_4BYTE_KIND:
8757 out = ucs4lib_split(
8758 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8759 break;
8760 default:
8761 out = NULL;
8762 }
8763 if (kind1 != kind)
8764 PyMem_Free(buf1);
8765 if (kind2 != kind)
8766 PyMem_Free(buf2);
8767 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008768}
8769
Alexander Belopolsky40018472011-02-26 01:02:56 +00008770static PyObject *
8771rsplit(PyUnicodeObject *self,
8772 PyUnicodeObject *substring,
8773 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008774{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008775 int kind1, kind2, kind;
8776 void *buf1, *buf2;
8777 Py_ssize_t len1, len2;
8778 PyObject* out;
8779
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008780 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008781 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008782
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008783 if (PyUnicode_READY(self) == -1)
8784 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008785
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008786 if (substring == NULL)
8787 switch(PyUnicode_KIND(self)) {
8788 case PyUnicode_1BYTE_KIND:
8789 return ucs1lib_rsplit_whitespace(
8790 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8791 PyUnicode_GET_LENGTH(self), maxcount
8792 );
8793 case PyUnicode_2BYTE_KIND:
8794 return ucs2lib_rsplit_whitespace(
8795 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8796 PyUnicode_GET_LENGTH(self), maxcount
8797 );
8798 case PyUnicode_4BYTE_KIND:
8799 return ucs4lib_rsplit_whitespace(
8800 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8801 PyUnicode_GET_LENGTH(self), maxcount
8802 );
8803 default:
8804 assert(0);
8805 return NULL;
8806 }
8807
8808 if (PyUnicode_READY(substring) == -1)
8809 return NULL;
8810
8811 kind1 = PyUnicode_KIND(self);
8812 kind2 = PyUnicode_KIND(substring);
8813 kind = kind1 > kind2 ? kind1 : kind2;
8814 buf1 = PyUnicode_DATA(self);
8815 buf2 = PyUnicode_DATA(substring);
8816 if (kind1 != kind)
8817 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8818 if (!buf1)
8819 return NULL;
8820 if (kind2 != kind)
8821 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8822 if (!buf2) {
8823 if (kind1 != kind) PyMem_Free(buf1);
8824 return NULL;
8825 }
8826 len1 = PyUnicode_GET_LENGTH(self);
8827 len2 = PyUnicode_GET_LENGTH(substring);
8828
8829 switch(kind) {
8830 case PyUnicode_1BYTE_KIND:
8831 out = ucs1lib_rsplit(
8832 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8833 break;
8834 case PyUnicode_2BYTE_KIND:
8835 out = ucs2lib_rsplit(
8836 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8837 break;
8838 case PyUnicode_4BYTE_KIND:
8839 out = ucs4lib_rsplit(
8840 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8841 break;
8842 default:
8843 out = NULL;
8844 }
8845 if (kind1 != kind)
8846 PyMem_Free(buf1);
8847 if (kind2 != kind)
8848 PyMem_Free(buf2);
8849 return out;
8850}
8851
8852static Py_ssize_t
8853anylib_find(int kind, void *buf1, Py_ssize_t len1,
8854 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
8855{
8856 switch(kind) {
8857 case PyUnicode_1BYTE_KIND:
8858 return ucs1lib_find(buf1, len1, buf2, len2, offset);
8859 case PyUnicode_2BYTE_KIND:
8860 return ucs2lib_find(buf1, len1, buf2, len2, offset);
8861 case PyUnicode_4BYTE_KIND:
8862 return ucs4lib_find(buf1, len1, buf2, len2, offset);
8863 }
8864 assert(0);
8865 return -1;
8866}
8867
8868static Py_ssize_t
8869anylib_count(int kind, void* sbuf, Py_ssize_t slen,
8870 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
8871{
8872 switch(kind) {
8873 case PyUnicode_1BYTE_KIND:
8874 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
8875 case PyUnicode_2BYTE_KIND:
8876 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
8877 case PyUnicode_4BYTE_KIND:
8878 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
8879 }
8880 assert(0);
8881 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008882}
8883
Alexander Belopolsky40018472011-02-26 01:02:56 +00008884static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008885replace(PyObject *self, PyObject *str1,
8886 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008887{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008888 PyObject *u;
8889 char *sbuf = PyUnicode_DATA(self);
8890 char *buf1 = PyUnicode_DATA(str1);
8891 char *buf2 = PyUnicode_DATA(str2);
8892 int srelease = 0, release1 = 0, release2 = 0;
8893 int skind = PyUnicode_KIND(self);
8894 int kind1 = PyUnicode_KIND(str1);
8895 int kind2 = PyUnicode_KIND(str2);
8896 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
8897 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
8898 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008899
8900 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008901 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008902 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008903 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008904
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008905 if (skind < kind1)
8906 /* substring too wide to be present */
8907 goto nothing;
8908
8909 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00008910 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008911 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008912 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008913 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008914 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008915 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008916 Py_UCS4 u1, u2, maxchar;
8917 int mayshrink, rkind;
8918 u1 = PyUnicode_READ_CHAR(str1, 0);
8919 if (!findchar(sbuf, PyUnicode_KIND(self),
8920 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00008921 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008922 u2 = PyUnicode_READ_CHAR(str2, 0);
8923 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8924 /* Replacing u1 with u2 may cause a maxchar reduction in the
8925 result string. */
8926 mayshrink = maxchar > 127;
8927 if (u2 > maxchar) {
8928 maxchar = u2;
8929 mayshrink = 0;
8930 }
8931 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008932 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008933 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008934 if (PyUnicode_CopyCharacters(u, 0,
8935 (PyObject*)self, 0, slen) < 0)
8936 {
8937 Py_DECREF(u);
8938 return NULL;
8939 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008940 rkind = PyUnicode_KIND(u);
8941 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
8942 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008943 if (--maxcount < 0)
8944 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008945 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008946 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008947 if (mayshrink) {
8948 PyObject *tmp = u;
8949 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
8950 PyUnicode_GET_LENGTH(tmp));
8951 Py_DECREF(tmp);
8952 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008953 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008954 int rkind = skind;
8955 char *res;
8956 if (kind1 < rkind) {
8957 /* widen substring */
8958 buf1 = _PyUnicode_AsKind(str1, rkind);
8959 if (!buf1) goto error;
8960 release1 = 1;
8961 }
8962 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008963 if (i < 0)
8964 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008965 if (rkind > kind2) {
8966 /* widen replacement */
8967 buf2 = _PyUnicode_AsKind(str2, rkind);
8968 if (!buf2) goto error;
8969 release2 = 1;
8970 }
8971 else if (rkind < kind2) {
8972 /* widen self and buf1 */
8973 rkind = kind2;
8974 if (release1) PyMem_Free(buf1);
8975 sbuf = _PyUnicode_AsKind(self, rkind);
8976 if (!sbuf) goto error;
8977 srelease = 1;
8978 buf1 = _PyUnicode_AsKind(str1, rkind);
8979 if (!buf1) goto error;
8980 release1 = 1;
8981 }
8982 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
8983 if (!res) {
8984 PyErr_NoMemory();
8985 goto error;
8986 }
8987 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008988 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008989 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
8990 buf2,
8991 PyUnicode_KIND_SIZE(rkind, len2));
8992 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008993
8994 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008995 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
8996 slen-i,
8997 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008998 if (i == -1)
8999 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009000 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9001 buf2,
9002 PyUnicode_KIND_SIZE(rkind, len2));
9003 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009004 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009005
9006 u = PyUnicode_FromKindAndData(rkind, res, slen);
9007 PyMem_Free(res);
9008 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009009 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009010 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009011
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009012 Py_ssize_t n, i, j, ires;
9013 Py_ssize_t product, new_size;
9014 int rkind = skind;
9015 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009016
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009017 if (kind1 < rkind) {
9018 buf1 = _PyUnicode_AsKind(str1, rkind);
9019 if (!buf1) goto error;
9020 release1 = 1;
9021 }
9022 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009023 if (n == 0)
9024 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009025 if (kind2 < rkind) {
9026 buf2 = _PyUnicode_AsKind(str2, rkind);
9027 if (!buf2) goto error;
9028 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009029 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009030 else if (kind2 > rkind) {
9031 rkind = kind2;
9032 sbuf = _PyUnicode_AsKind(self, rkind);
9033 if (!sbuf) goto error;
9034 srelease = 1;
9035 if (release1) PyMem_Free(buf1);
9036 buf1 = _PyUnicode_AsKind(str1, rkind);
9037 if (!buf1) goto error;
9038 release1 = 1;
9039 }
9040 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9041 PyUnicode_GET_LENGTH(str1))); */
9042 product = n * (len2-len1);
9043 if ((product / (len2-len1)) != n) {
9044 PyErr_SetString(PyExc_OverflowError,
9045 "replace string is too long");
9046 goto error;
9047 }
9048 new_size = slen + product;
9049 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9050 PyErr_SetString(PyExc_OverflowError,
9051 "replace string is too long");
9052 goto error;
9053 }
9054 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9055 if (!res)
9056 goto error;
9057 ires = i = 0;
9058 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009059 while (n-- > 0) {
9060 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009061 j = anylib_find(rkind,
9062 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9063 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009064 if (j == -1)
9065 break;
9066 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009067 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009068 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9069 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9070 PyUnicode_KIND_SIZE(rkind, j-i));
9071 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009072 }
9073 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009074 if (len2 > 0) {
9075 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9076 buf2,
9077 PyUnicode_KIND_SIZE(rkind, len2));
9078 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009079 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009080 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009081 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009082 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009083 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009084 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9085 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9086 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009087 } else {
9088 /* interleave */
9089 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009090 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9091 buf2,
9092 PyUnicode_KIND_SIZE(rkind, len2));
9093 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009094 if (--n <= 0)
9095 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009096 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9097 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9098 PyUnicode_KIND_SIZE(rkind, 1));
9099 ires++;
9100 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009101 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009102 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9103 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9104 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009105 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009106 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Martin v. Löwis0b1d3482011-10-01 16:35:40 +02009107 PyMem_Free(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009108 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009109 if (srelease)
9110 PyMem_FREE(sbuf);
9111 if (release1)
9112 PyMem_FREE(buf1);
9113 if (release2)
9114 PyMem_FREE(buf2);
9115 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009116
Benjamin Peterson29060642009-01-31 22:14:21 +00009117 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009118 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009119 if (srelease)
9120 PyMem_FREE(sbuf);
9121 if (release1)
9122 PyMem_FREE(buf1);
9123 if (release2)
9124 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009125 if (PyUnicode_CheckExact(self)) {
9126 Py_INCREF(self);
9127 return (PyObject *) self;
9128 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009129 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009130 error:
9131 if (srelease && sbuf)
9132 PyMem_FREE(sbuf);
9133 if (release1 && buf1)
9134 PyMem_FREE(buf1);
9135 if (release2 && buf2)
9136 PyMem_FREE(buf2);
9137 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009138}
9139
9140/* --- Unicode Object Methods --------------------------------------------- */
9141
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009142PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009143 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009144\n\
9145Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009146characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009147
9148static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009149unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009150{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009151 return fixup(self, fixtitle);
9152}
9153
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009154PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009155 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009156\n\
9157Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009158have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009159
9160static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009161unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009162{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009163 return fixup(self, fixcapitalize);
9164}
9165
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of a capwords() method: split self into words,
   capitalize every word in place inside the list, then join the list.
   Returns NULL if splitting or capitalizing a word fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined = NULL;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *original = PyList_GET_ITEM(words, pos);
        PyObject *capitalized = fixup((PyUnicodeObject *)original,
                                      fixcapitalize);
        if (capitalized == NULL)
            goto done;          /* joined is still NULL here */
        Py_DECREF(original);
        PyList_SET_ITEM(words, pos, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return joined;
}
#endif
9203
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009204/* Argument converter. Coerces to a single unicode character */
9205
9206static int
9207convert_uc(PyObject *obj, void *addr)
9208{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009209 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009210 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009211
Benjamin Peterson14339b62009-01-31 16:36:08 +00009212 uniobj = PyUnicode_FromObject(obj);
9213 if (uniobj == NULL) {
9214 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009215 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009216 return 0;
9217 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009218 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009219 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009220 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009221 Py_DECREF(uniobj);
9222 return 0;
9223 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009224 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009225 Py_DECREF(uniobj);
9226 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009227}
9228
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009229PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009230 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009231\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009232Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009233done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009234
9235static PyObject *
9236unicode_center(PyUnicodeObject *self, PyObject *args)
9237{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009238 Py_ssize_t marg, left;
9239 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009240 Py_UCS4 fillchar = ' ';
9241
Victor Stinnere9a29352011-10-01 02:14:59 +02009242 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009243 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009244
Victor Stinnere9a29352011-10-01 02:14:59 +02009245 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009246 return NULL;
9247
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009248 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009249 Py_INCREF(self);
9250 return (PyObject*) self;
9251 }
9252
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009253 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009254 left = marg / 2 + (marg & width & 1);
9255
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009256 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009257}
9258
Marc-André Lemburge5034372000-08-08 08:04:29 +00009259#if 0
9260
9261/* This code should go into some future Unicode collation support
9262 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009263 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009264
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009265/* speedy UTF-16 code point order comparison */
9266/* gleaned from: */
9267/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9268
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009269static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009270{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009271 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009272 0, 0, 0, 0, 0, 0, 0, 0,
9273 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009274 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009275};
9276
Guido van Rossumd57fd912000-03-10 22:53:23 +00009277static int
9278unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9279{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009280 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009281
Guido van Rossumd57fd912000-03-10 22:53:23 +00009282 Py_UNICODE *s1 = str1->str;
9283 Py_UNICODE *s2 = str2->str;
9284
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009285 len1 = str1->_base._base.length;
9286 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009287
Guido van Rossumd57fd912000-03-10 22:53:23 +00009288 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009289 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009290
9291 c1 = *s1++;
9292 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009293
Benjamin Peterson29060642009-01-31 22:14:21 +00009294 if (c1 > (1<<11) * 26)
9295 c1 += utf16Fixup[c1>>11];
9296 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009297 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009298 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009299
9300 if (c1 != c2)
9301 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009302
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009303 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009304 }
9305
9306 return (len1 < len2) ? -1 : (len1 != len2);
9307}
9308
Marc-André Lemburge5034372000-08-08 08:04:29 +00009309#else
9310
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009311/* This function assumes that str1 and str2 are readied by the caller. */
9312
Marc-André Lemburge5034372000-08-08 08:04:29 +00009313static int
9314unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9315{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009316 int kind1, kind2;
9317 void *data1, *data2;
9318 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009319
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009320 kind1 = PyUnicode_KIND(str1);
9321 kind2 = PyUnicode_KIND(str2);
9322 data1 = PyUnicode_DATA(str1);
9323 data2 = PyUnicode_DATA(str2);
9324 len1 = PyUnicode_GET_LENGTH(str1);
9325 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009326
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009327 for (i = 0; i < len1 && i < len2; ++i) {
9328 Py_UCS4 c1, c2;
9329 c1 = PyUnicode_READ(kind1, data1, i);
9330 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009331
9332 if (c1 != c2)
9333 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009334 }
9335
9336 return (len1 < len2) ? -1 : (len1 != len2);
9337}
9338
9339#endif
9340
Alexander Belopolsky40018472011-02-26 01:02:56 +00009341int
9342PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009343{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009344 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9345 if (PyUnicode_READY(left) == -1 ||
9346 PyUnicode_READY(right) == -1)
9347 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009348 return unicode_compare((PyUnicodeObject *)left,
9349 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009350 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009351 PyErr_Format(PyExc_TypeError,
9352 "Can't compare %.100s and %.100s",
9353 left->ob_type->tp_name,
9354 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009355 return -1;
9356}
9357
Martin v. Löwis5b222132007-06-10 09:51:05 +00009358int
9359PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9360{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009361 Py_ssize_t i;
9362 int kind;
9363 void *data;
9364 Py_UCS4 chr;
9365
Martin v. Löwis5b222132007-06-10 09:51:05 +00009366 assert(PyUnicode_Check(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009367 if (PyUnicode_READY(uni) == -1)
9368 return -1;
9369 kind = PyUnicode_KIND(uni);
9370 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009371 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009372 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9373 if (chr != str[i])
9374 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009375 /* This check keeps Python strings that end in '\0' from comparing equal
9376 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009377 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009378 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009379 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009380 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009381 return 0;
9382}
9383
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009384
Benjamin Peterson29060642009-01-31 22:14:21 +00009385#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009386 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009387
Alexander Belopolsky40018472011-02-26 01:02:56 +00009388PyObject *
9389PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009390{
9391 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009392
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009393 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9394 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009395 if (PyUnicode_READY(left) == -1 ||
9396 PyUnicode_READY(right) == -1)
9397 return NULL;
9398 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9399 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009400 if (op == Py_EQ) {
9401 Py_INCREF(Py_False);
9402 return Py_False;
9403 }
9404 if (op == Py_NE) {
9405 Py_INCREF(Py_True);
9406 return Py_True;
9407 }
9408 }
9409 if (left == right)
9410 result = 0;
9411 else
9412 result = unicode_compare((PyUnicodeObject *)left,
9413 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009414
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009415 /* Convert the return value to a Boolean */
9416 switch (op) {
9417 case Py_EQ:
9418 v = TEST_COND(result == 0);
9419 break;
9420 case Py_NE:
9421 v = TEST_COND(result != 0);
9422 break;
9423 case Py_LE:
9424 v = TEST_COND(result <= 0);
9425 break;
9426 case Py_GE:
9427 v = TEST_COND(result >= 0);
9428 break;
9429 case Py_LT:
9430 v = TEST_COND(result == -1);
9431 break;
9432 case Py_GT:
9433 v = TEST_COND(result == 1);
9434 break;
9435 default:
9436 PyErr_BadArgument();
9437 return NULL;
9438 }
9439 Py_INCREF(v);
9440 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009441 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009442
Brian Curtindfc80e32011-08-10 20:28:54 -05009443 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009444}
9445
Alexander Belopolsky40018472011-02-26 01:02:56 +00009446int
9447PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009448{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009449 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009450 int kind1, kind2, kind;
9451 void *buf1, *buf2;
9452 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009453 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009454
9455 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009456 sub = PyUnicode_FromObject(element);
9457 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009458 PyErr_Format(PyExc_TypeError,
9459 "'in <string>' requires string as left operand, not %s",
9460 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009461 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009462 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009463 if (PyUnicode_READY(sub) == -1)
9464 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009465
Thomas Wouters477c8d52006-05-27 19:21:47 +00009466 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +02009467 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009468 Py_DECREF(sub);
9469 return -1;
9470 }
9471
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009472 kind1 = PyUnicode_KIND(str);
9473 kind2 = PyUnicode_KIND(sub);
9474 kind = kind1 > kind2 ? kind1 : kind2;
9475 buf1 = PyUnicode_DATA(str);
9476 buf2 = PyUnicode_DATA(sub);
9477 if (kind1 != kind)
9478 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9479 if (!buf1) {
9480 Py_DECREF(sub);
9481 return -1;
9482 }
9483 if (kind2 != kind)
9484 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9485 if (!buf2) {
9486 Py_DECREF(sub);
9487 if (kind1 != kind) PyMem_Free(buf1);
9488 return -1;
9489 }
9490 len1 = PyUnicode_GET_LENGTH(str);
9491 len2 = PyUnicode_GET_LENGTH(sub);
9492
9493 switch(kind) {
9494 case PyUnicode_1BYTE_KIND:
9495 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9496 break;
9497 case PyUnicode_2BYTE_KIND:
9498 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9499 break;
9500 case PyUnicode_4BYTE_KIND:
9501 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9502 break;
9503 default:
9504 result = -1;
9505 assert(0);
9506 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009507
9508 Py_DECREF(str);
9509 Py_DECREF(sub);
9510
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009511 if (kind1 != kind)
9512 PyMem_Free(buf1);
9513 if (kind2 != kind)
9514 PyMem_Free(buf2);
9515
Guido van Rossum403d68b2000-03-13 15:55:09 +00009516 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009517}
9518
Guido van Rossumd57fd912000-03-10 22:53:23 +00009519/* Concat to string or Unicode object giving a new Unicode object. */
9520
Alexander Belopolsky40018472011-02-26 01:02:56 +00009521PyObject *
9522PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009523{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009524 PyObject *u = NULL, *v = NULL, *w;
9525 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009526
9527 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009528 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009529 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009530 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009531 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009532 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009533 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009534
9535 /* Shortcuts */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009536 if (v == (PyObject*)unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009537 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009538 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009539 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009540 if (u == (PyObject*)unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009541 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009542 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009543 }
9544
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009545 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +02009546 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009547
Guido van Rossumd57fd912000-03-10 22:53:23 +00009548 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009549 w = PyUnicode_New(
9550 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9551 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009552 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009553 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009554 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
9555 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +02009556 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009557 v, 0,
9558 PyUnicode_GET_LENGTH(v)) < 0)
9559 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009560 Py_DECREF(u);
9561 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009562 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009563
Benjamin Peterson29060642009-01-31 22:14:21 +00009564 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009565 Py_XDECREF(u);
9566 Py_XDECREF(v);
9567 return NULL;
9568}
9569
Walter Dörwald1ab83302007-05-18 17:15:44 +00009570void
9571PyUnicode_Append(PyObject **pleft, PyObject *right)
9572{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009573 PyObject *new;
9574 if (*pleft == NULL)
9575 return;
9576 if (right == NULL || !PyUnicode_Check(*pleft)) {
9577 Py_DECREF(*pleft);
9578 *pleft = NULL;
9579 return;
9580 }
9581 new = PyUnicode_Concat(*pleft, right);
9582 Py_DECREF(*pleft);
9583 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00009584}
9585
9586void
9587PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
9588{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009589 PyUnicode_Append(pleft, right);
9590 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00009591}
9592
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009593PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009594 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009595\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00009596Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009597string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009598interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009599
9600static PyObject *
9601unicode_count(PyUnicodeObject *self, PyObject *args)
9602{
9603 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009604 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009605 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009606 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009607 int kind1, kind2, kind;
9608 void *buf1, *buf2;
9609 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009610
Jesus Ceaac451502011-04-20 17:09:23 +02009611 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
9612 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009613 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00009614
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009615 kind1 = PyUnicode_KIND(self);
9616 kind2 = PyUnicode_KIND(substring);
9617 kind = kind1 > kind2 ? kind1 : kind2;
9618 buf1 = PyUnicode_DATA(self);
9619 buf2 = PyUnicode_DATA(substring);
9620 if (kind1 != kind)
9621 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9622 if (!buf1) {
9623 Py_DECREF(substring);
9624 return NULL;
9625 }
9626 if (kind2 != kind)
9627 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9628 if (!buf2) {
9629 Py_DECREF(substring);
9630 if (kind1 != kind) PyMem_Free(buf1);
9631 return NULL;
9632 }
9633 len1 = PyUnicode_GET_LENGTH(self);
9634 len2 = PyUnicode_GET_LENGTH(substring);
9635
9636 ADJUST_INDICES(start, end, len1);
9637 switch(kind) {
9638 case PyUnicode_1BYTE_KIND:
9639 iresult = ucs1lib_count(
9640 ((Py_UCS1*)buf1) + start, end - start,
9641 buf2, len2, PY_SSIZE_T_MAX
9642 );
9643 break;
9644 case PyUnicode_2BYTE_KIND:
9645 iresult = ucs2lib_count(
9646 ((Py_UCS2*)buf1) + start, end - start,
9647 buf2, len2, PY_SSIZE_T_MAX
9648 );
9649 break;
9650 case PyUnicode_4BYTE_KIND:
9651 iresult = ucs4lib_count(
9652 ((Py_UCS4*)buf1) + start, end - start,
9653 buf2, len2, PY_SSIZE_T_MAX
9654 );
9655 break;
9656 default:
9657 assert(0); iresult = 0;
9658 }
9659
9660 result = PyLong_FromSsize_t(iresult);
9661
9662 if (kind1 != kind)
9663 PyMem_Free(buf1);
9664 if (kind2 != kind)
9665 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009666
9667 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009668
Guido van Rossumd57fd912000-03-10 22:53:23 +00009669 return result;
9670}
9671
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009672PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00009673 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009674\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00009675Encode S using the codec registered for encoding. Default encoding\n\
9676is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00009677handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009678a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
9679'xmlcharrefreplace' as well as any other name registered with\n\
9680codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009681
9682static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00009683unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009684{
Benjamin Peterson308d6372009-09-18 21:42:35 +00009685 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00009686 char *encoding = NULL;
9687 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00009688
Benjamin Peterson308d6372009-09-18 21:42:35 +00009689 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
9690 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009691 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00009692 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00009693}
9694
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009695PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009696 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009697\n\
9698Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009699If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009700
9701static PyObject*
9702unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
9703{
9704 Py_UNICODE *e;
9705 Py_UNICODE *p;
9706 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009707 Py_UNICODE *qe;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009708 Py_ssize_t i, j, incr, wstr_length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009709 PyUnicodeObject *u;
9710 int tabsize = 8;
9711
9712 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00009713 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009714
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009715 if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL)
9716 return NULL;
9717
Thomas Wouters7e474022000-07-16 12:04:32 +00009718 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009719 i = 0; /* chars up to and including most recent \n or \r */
9720 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009721 e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */
9722 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009723 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009724 if (tabsize > 0) {
9725 incr = tabsize - (j % tabsize); /* cannot overflow */
9726 if (j > PY_SSIZE_T_MAX - incr)
9727 goto overflow1;
9728 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009729 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009730 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009731 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009732 if (j > PY_SSIZE_T_MAX - 1)
9733 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009734 j++;
9735 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009736 if (i > PY_SSIZE_T_MAX - j)
9737 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009738 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009739 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009740 }
9741 }
9742
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009743 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00009744 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009745
Guido van Rossumd57fd912000-03-10 22:53:23 +00009746 /* Second pass: create output string and fill it */
9747 u = _PyUnicode_New(i + j);
9748 if (!u)
9749 return NULL;
9750
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009751 j = 0; /* same as in first pass */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009752 q = _PyUnicode_WSTR(u); /* next output char */
9753 qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009754
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009755 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009756 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009757 if (tabsize > 0) {
9758 i = tabsize - (j % tabsize);
9759 j += i;
9760 while (i--) {
9761 if (q >= qe)
9762 goto overflow2;
9763 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009764 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009765 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009766 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009767 else {
9768 if (q >= qe)
9769 goto overflow2;
9770 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009771 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009772 if (*p == '\n' || *p == '\r')
9773 j = 0;
9774 }
9775
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009776 if (PyUnicode_READY(u) == -1) {
9777 Py_DECREF(u);
9778 return NULL;
9779 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009780 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009781
9782 overflow2:
9783 Py_DECREF(u);
9784 overflow1:
9785 PyErr_SetString(PyExc_OverflowError, "new string is too long");
9786 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009787}
9788
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009789PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009790 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009791\n\
9792Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +08009793such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009794arguments start and end are interpreted as in slice notation.\n\
9795\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009796Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009797
9798static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009799unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009800{
Jesus Ceaac451502011-04-20 17:09:23 +02009801 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009802 Py_ssize_t start;
9803 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009804 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009805
Jesus Ceaac451502011-04-20 17:09:23 +02009806 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
9807 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009808 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009809
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009810 if (PyUnicode_READY(self) == -1)
9811 return NULL;
9812 if (PyUnicode_READY(substring) == -1)
9813 return NULL;
9814
9815 result = any_find_slice(
9816 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9817 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009818 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009819
9820 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009821
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009822 if (result == -2)
9823 return NULL;
9824
Christian Heimes217cfd12007-12-02 14:31:20 +00009825 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009826}
9827
9828static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +02009829unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009830{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02009831 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
9832 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009833 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009834 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009835}
9836
Guido van Rossumc2504932007-09-18 19:42:40 +00009837/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +01009838 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00009839static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00009840unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009841{
Guido van Rossumc2504932007-09-18 19:42:40 +00009842 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +01009843 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009844
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009845 if (_PyUnicode_HASH(self) != -1)
9846 return _PyUnicode_HASH(self);
9847 if (PyUnicode_READY(self) == -1)
9848 return -1;
9849 len = PyUnicode_GET_LENGTH(self);
9850
9851 /* The hash function as a macro, gets expanded three times below. */
9852#define HASH(P) \
9853 x = (Py_uhash_t)*P << 7; \
9854 while (--len >= 0) \
9855 x = (1000003*x) ^ (Py_uhash_t)*P++;
9856
9857 switch (PyUnicode_KIND(self)) {
9858 case PyUnicode_1BYTE_KIND: {
9859 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
9860 HASH(c);
9861 break;
9862 }
9863 case PyUnicode_2BYTE_KIND: {
9864 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
9865 HASH(s);
9866 break;
9867 }
9868 default: {
9869 Py_UCS4 *l;
9870 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
9871 "Impossible switch case in unicode_hash");
9872 l = PyUnicode_4BYTE_DATA(self);
9873 HASH(l);
9874 break;
9875 }
9876 }
9877 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
9878
Guido van Rossumc2504932007-09-18 19:42:40 +00009879 if (x == -1)
9880 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009881 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009882 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009883}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009884#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +00009885
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009886PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009887 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009888\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009889Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009890
9891static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009892unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009893{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009894 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +02009895 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009896 Py_ssize_t start;
9897 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009898
Jesus Ceaac451502011-04-20 17:09:23 +02009899 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
9900 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009901 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009902
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009903 if (PyUnicode_READY(self) == -1)
9904 return NULL;
9905 if (PyUnicode_READY(substring) == -1)
9906 return NULL;
9907
9908 result = any_find_slice(
9909 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9910 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009911 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009912
9913 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009914
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009915 if (result == -2)
9916 return NULL;
9917
Guido van Rossumd57fd912000-03-10 22:53:23 +00009918 if (result < 0) {
9919 PyErr_SetString(PyExc_ValueError, "substring not found");
9920 return NULL;
9921 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009922
Christian Heimes217cfd12007-12-02 14:31:20 +00009923 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009924}
9925
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009926PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009927 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009928\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00009929Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009930at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009931
9932static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009933unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009934{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009935 Py_ssize_t i, length;
9936 int kind;
9937 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009938 int cased;
9939
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009940 if (PyUnicode_READY(self) == -1)
9941 return NULL;
9942 length = PyUnicode_GET_LENGTH(self);
9943 kind = PyUnicode_KIND(self);
9944 data = PyUnicode_DATA(self);
9945
Guido van Rossumd57fd912000-03-10 22:53:23 +00009946 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009947 if (length == 1)
9948 return PyBool_FromLong(
9949 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00009950
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009951 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009952 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009953 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009954
Guido van Rossumd57fd912000-03-10 22:53:23 +00009955 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009956 for (i = 0; i < length; i++) {
9957 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009958
Benjamin Peterson29060642009-01-31 22:14:21 +00009959 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
9960 return PyBool_FromLong(0);
9961 else if (!cased && Py_UNICODE_ISLOWER(ch))
9962 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009963 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009964 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009965}
9966
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009967PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009968 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009969\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00009970Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009971at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009972
9973static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009974unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009975{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009976 Py_ssize_t i, length;
9977 int kind;
9978 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009979 int cased;
9980
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009981 if (PyUnicode_READY(self) == -1)
9982 return NULL;
9983 length = PyUnicode_GET_LENGTH(self);
9984 kind = PyUnicode_KIND(self);
9985 data = PyUnicode_DATA(self);
9986
Guido van Rossumd57fd912000-03-10 22:53:23 +00009987 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009988 if (length == 1)
9989 return PyBool_FromLong(
9990 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009991
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009992 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009993 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009994 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009995
Guido van Rossumd57fd912000-03-10 22:53:23 +00009996 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009997 for (i = 0; i < length; i++) {
9998 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009999
Benjamin Peterson29060642009-01-31 22:14:21 +000010000 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10001 return PyBool_FromLong(0);
10002 else if (!cased && Py_UNICODE_ISUPPER(ch))
10003 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010004 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010005 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010006}
10007
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010008PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010009 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010010\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010011Return True if S is a titlecased string and there is at least one\n\
10012character in S, i.e. upper- and titlecase characters may only\n\
10013follow uncased characters and lowercase characters only cased ones.\n\
10014Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010015
10016static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010017unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010018{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010019 Py_ssize_t i, length;
10020 int kind;
10021 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010022 int cased, previous_is_cased;
10023
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010024 if (PyUnicode_READY(self) == -1)
10025 return NULL;
10026 length = PyUnicode_GET_LENGTH(self);
10027 kind = PyUnicode_KIND(self);
10028 data = PyUnicode_DATA(self);
10029
Guido van Rossumd57fd912000-03-10 22:53:23 +000010030 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010031 if (length == 1) {
10032 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10033 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10034 (Py_UNICODE_ISUPPER(ch) != 0));
10035 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010036
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010037 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010038 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010039 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010040
Guido van Rossumd57fd912000-03-10 22:53:23 +000010041 cased = 0;
10042 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010043 for (i = 0; i < length; i++) {
10044 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010045
Benjamin Peterson29060642009-01-31 22:14:21 +000010046 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10047 if (previous_is_cased)
10048 return PyBool_FromLong(0);
10049 previous_is_cased = 1;
10050 cased = 1;
10051 }
10052 else if (Py_UNICODE_ISLOWER(ch)) {
10053 if (!previous_is_cased)
10054 return PyBool_FromLong(0);
10055 previous_is_cased = 1;
10056 cased = 1;
10057 }
10058 else
10059 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010060 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010061 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010062}
10063
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010064PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010065 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010066\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010067Return True if all characters in S are whitespace\n\
10068and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010069
10070static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010071unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010072{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010073 Py_ssize_t i, length;
10074 int kind;
10075 void *data;
10076
10077 if (PyUnicode_READY(self) == -1)
10078 return NULL;
10079 length = PyUnicode_GET_LENGTH(self);
10080 kind = PyUnicode_KIND(self);
10081 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010082
Guido van Rossumd57fd912000-03-10 22:53:23 +000010083 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010084 if (length == 1)
10085 return PyBool_FromLong(
10086 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010087
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010088 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010089 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010090 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010091
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010092 for (i = 0; i < length; i++) {
10093 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010094 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010095 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010096 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010097 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010098}
10099
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010100PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010101 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010102\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010103Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010104and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010105
10106static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010107unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010108{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010109 Py_ssize_t i, length;
10110 int kind;
10111 void *data;
10112
10113 if (PyUnicode_READY(self) == -1)
10114 return NULL;
10115 length = PyUnicode_GET_LENGTH(self);
10116 kind = PyUnicode_KIND(self);
10117 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010118
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010119 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010120 if (length == 1)
10121 return PyBool_FromLong(
10122 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010123
10124 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010125 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010126 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010127
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010128 for (i = 0; i < length; i++) {
10129 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010130 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010131 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010132 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010133}
10134
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010135PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010136 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010137\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010138Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010139and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010140
10141static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010142unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010143{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010144 int kind;
10145 void *data;
10146 Py_ssize_t len, i;
10147
10148 if (PyUnicode_READY(self) == -1)
10149 return NULL;
10150
10151 kind = PyUnicode_KIND(self);
10152 data = PyUnicode_DATA(self);
10153 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010154
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010155 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010156 if (len == 1) {
10157 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10158 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10159 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010160
10161 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010162 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010163 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010164
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010165 for (i = 0; i < len; i++) {
10166 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010167 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010168 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010169 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010170 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010171}
10172
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010173PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010174 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010175\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010176Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010177False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010178
10179static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010180unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010181{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010182 Py_ssize_t i, length;
10183 int kind;
10184 void *data;
10185
10186 if (PyUnicode_READY(self) == -1)
10187 return NULL;
10188 length = PyUnicode_GET_LENGTH(self);
10189 kind = PyUnicode_KIND(self);
10190 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010191
Guido van Rossumd57fd912000-03-10 22:53:23 +000010192 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010193 if (length == 1)
10194 return PyBool_FromLong(
10195 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010196
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010197 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010198 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010199 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010200
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010201 for (i = 0; i < length; i++) {
10202 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010203 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010204 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010205 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010206}
10207
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010208PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010209 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010210\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010211Return True if all characters in S are digits\n\
10212and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010213
10214static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010215unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010216{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010217 Py_ssize_t i, length;
10218 int kind;
10219 void *data;
10220
10221 if (PyUnicode_READY(self) == -1)
10222 return NULL;
10223 length = PyUnicode_GET_LENGTH(self);
10224 kind = PyUnicode_KIND(self);
10225 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010226
Guido van Rossumd57fd912000-03-10 22:53:23 +000010227 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010228 if (length == 1) {
10229 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10230 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10231 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010232
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010233 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010234 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010235 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010236
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010237 for (i = 0; i < length; i++) {
10238 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010239 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010240 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010241 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010242}
10243
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010244PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010245 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010246\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010247Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010248False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010249
10250static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010251unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010252{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010253 Py_ssize_t i, length;
10254 int kind;
10255 void *data;
10256
10257 if (PyUnicode_READY(self) == -1)
10258 return NULL;
10259 length = PyUnicode_GET_LENGTH(self);
10260 kind = PyUnicode_KIND(self);
10261 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010262
Guido van Rossumd57fd912000-03-10 22:53:23 +000010263 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010264 if (length == 1)
10265 return PyBool_FromLong(
10266 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010267
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010268 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010269 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010270 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010271
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010272 for (i = 0; i < length; i++) {
10273 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010274 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010275 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010276 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010277}
10278
Martin v. Löwis47383402007-08-15 07:32:56 +000010279int
10280PyUnicode_IsIdentifier(PyObject *self)
10281{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010282 int kind;
10283 void *data;
10284 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010285 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010286
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010287 if (PyUnicode_READY(self) == -1) {
10288 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010289 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010290 }
10291
10292 /* Special case for empty strings */
10293 if (PyUnicode_GET_LENGTH(self) == 0)
10294 return 0;
10295 kind = PyUnicode_KIND(self);
10296 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010297
10298 /* PEP 3131 says that the first character must be in
10299 XID_Start and subsequent characters in XID_Continue,
10300 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010301 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010302 letters, digits, underscore). However, given the current
10303 definition of XID_Start and XID_Continue, it is sufficient
10304 to check just for these, except that _ must be allowed
10305 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010306 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010307 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010308 return 0;
10309
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010310 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010311 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010312 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010313 return 1;
10314}
10315
10316PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010317 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010318\n\
10319Return True if S is a valid identifier according\n\
10320to the language definition.");
10321
10322static PyObject*
10323unicode_isidentifier(PyObject *self)
10324{
10325 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10326}
10327
Georg Brandl559e5d72008-06-11 18:37:52 +000010328PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010329 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010330\n\
10331Return True if all characters in S are considered\n\
10332printable in repr() or S is empty, False otherwise.");
10333
10334static PyObject*
10335unicode_isprintable(PyObject *self)
10336{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010337 Py_ssize_t i, length;
10338 int kind;
10339 void *data;
10340
10341 if (PyUnicode_READY(self) == -1)
10342 return NULL;
10343 length = PyUnicode_GET_LENGTH(self);
10344 kind = PyUnicode_KIND(self);
10345 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010346
10347 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010348 if (length == 1)
10349 return PyBool_FromLong(
10350 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010351
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010352 for (i = 0; i < length; i++) {
10353 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010354 Py_RETURN_FALSE;
10355 }
10356 }
10357 Py_RETURN_TRUE;
10358}
10359
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010360PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010361 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010362\n\
10363Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010364iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010365
10366static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010367unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010368{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010369 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010370}
10371
Martin v. Löwis18e16552006-02-15 17:27:45 +000010372static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010373unicode_length(PyUnicodeObject *self)
10374{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010375 if (PyUnicode_READY(self) == -1)
10376 return -1;
10377 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010378}
10379
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010380PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010381 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010382\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010383Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010384done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010385
10386static PyObject *
10387unicode_ljust(PyUnicodeObject *self, PyObject *args)
10388{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010389 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010390 Py_UCS4 fillchar = ' ';
10391
10392 if (PyUnicode_READY(self) == -1)
10393 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010394
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010395 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010396 return NULL;
10397
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010398 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010399 Py_INCREF(self);
10400 return (PyObject*) self;
10401 }
10402
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010403 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010404}
10405
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010406PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010407 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010408\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010409Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010410
10411static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010412unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010413{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010414 return fixup(self, fixlower);
10415}
10416
/* Strip modes.  The values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, one per strip mode */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string without its
   three-character "|O:" prefix */
#define STRIPNAME(i) (&stripformat[i][3])
10425
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010426/* externally visible for str.strip(unicode) */
10427PyObject *
10428_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10429{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010430 void *data;
10431 int kind;
10432 Py_ssize_t i, j, len;
10433 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010434
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010435 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10436 return NULL;
10437
10438 kind = PyUnicode_KIND(self);
10439 data = PyUnicode_DATA(self);
10440 len = PyUnicode_GET_LENGTH(self);
10441 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10442 PyUnicode_DATA(sepobj),
10443 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010444
Benjamin Peterson14339b62009-01-31 16:36:08 +000010445 i = 0;
10446 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010447 while (i < len &&
10448 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010449 i++;
10450 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010451 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010452
Benjamin Peterson14339b62009-01-31 16:36:08 +000010453 j = len;
10454 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010455 do {
10456 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010457 } while (j >= i &&
10458 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000010459 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010460 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010461
Victor Stinner12bab6d2011-10-01 01:53:49 +020010462 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010463}
10464
10465PyObject*
10466PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10467{
10468 unsigned char *data;
10469 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020010470 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010471
Victor Stinnerde636f32011-10-01 03:55:54 +020010472 if (PyUnicode_READY(self) == -1)
10473 return NULL;
10474
10475 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
10476
Victor Stinner12bab6d2011-10-01 01:53:49 +020010477 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010478 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010479 if (PyUnicode_CheckExact(self)) {
10480 Py_INCREF(self);
10481 return self;
10482 }
10483 else
10484 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010485 }
10486
Victor Stinner12bab6d2011-10-01 01:53:49 +020010487 length = end - start;
10488 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010489 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010490
Victor Stinnerde636f32011-10-01 03:55:54 +020010491 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010492 PyErr_SetString(PyExc_IndexError, "string index out of range");
10493 return NULL;
10494 }
10495
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010496 kind = PyUnicode_KIND(self);
10497 data = PyUnicode_1BYTE_DATA(self);
Victor Stinner034f6cf2011-09-30 02:26:44 +020010498 return PyUnicode_FromKindAndData(kind,
10499 data + PyUnicode_KIND_SIZE(kind, start),
Victor Stinner12bab6d2011-10-01 01:53:49 +020010500 length);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010501}
Guido van Rossumd57fd912000-03-10 22:53:23 +000010502
10503static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010504do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010505{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010506 int kind;
10507 void *data;
10508 Py_ssize_t len, i, j;
10509
10510 if (PyUnicode_READY(self) == -1)
10511 return NULL;
10512
10513 kind = PyUnicode_KIND(self);
10514 data = PyUnicode_DATA(self);
10515 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010516
Benjamin Peterson14339b62009-01-31 16:36:08 +000010517 i = 0;
10518 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010519 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010520 i++;
10521 }
10522 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010523
Benjamin Peterson14339b62009-01-31 16:36:08 +000010524 j = len;
10525 if (striptype != LEFTSTRIP) {
10526 do {
10527 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010528 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010529 j++;
10530 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010531
Victor Stinner12bab6d2011-10-01 01:53:49 +020010532 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010533}
10534
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010535
10536static PyObject *
10537do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
10538{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010539 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010540
Benjamin Peterson14339b62009-01-31 16:36:08 +000010541 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
10542 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010543
Benjamin Peterson14339b62009-01-31 16:36:08 +000010544 if (sep != NULL && sep != Py_None) {
10545 if (PyUnicode_Check(sep))
10546 return _PyUnicode_XStrip(self, striptype, sep);
10547 else {
10548 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010549 "%s arg must be None or str",
10550 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010551 return NULL;
10552 }
10553 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010554
Benjamin Peterson14339b62009-01-31 16:36:08 +000010555 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010556}
10557
10558
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010559PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010560 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010561\n\
10562Return a copy of the string S with leading and trailing\n\
10563whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010564If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010565
10566static PyObject *
10567unicode_strip(PyUnicodeObject *self, PyObject *args)
10568{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010569 if (PyTuple_GET_SIZE(args) == 0)
10570 return do_strip(self, BOTHSTRIP); /* Common case */
10571 else
10572 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010573}
10574
10575
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010576PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010577 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010578\n\
10579Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010580If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010581
10582static PyObject *
10583unicode_lstrip(PyUnicodeObject *self, PyObject *args)
10584{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010585 if (PyTuple_GET_SIZE(args) == 0)
10586 return do_strip(self, LEFTSTRIP); /* Common case */
10587 else
10588 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010589}
10590
10591
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010592PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010593 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010594\n\
10595Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010596If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010597
10598static PyObject *
10599unicode_rstrip(PyUnicodeObject *self, PyObject *args)
10600{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010601 if (PyTuple_GET_SIZE(args) == 0)
10602 return do_strip(self, RIGHTSTRIP); /* Common case */
10603 else
10604 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010605}
10606
10607
Guido van Rossumd57fd912000-03-10 22:53:23 +000010608static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000010609unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010610{
10611 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010612 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010613
Georg Brandl222de0f2009-04-12 12:01:50 +000010614 if (len < 1) {
10615 Py_INCREF(unicode_empty);
10616 return (PyObject *)unicode_empty;
10617 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010618
Tim Peters7a29bd52001-09-12 03:03:31 +000010619 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010620 /* no repeat, return original string */
10621 Py_INCREF(str);
10622 return (PyObject*) str;
10623 }
Tim Peters8f422462000-09-09 06:13:41 +000010624
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010625 if (PyUnicode_READY(str) == -1)
10626 return NULL;
10627
Victor Stinnerc759f3e2011-10-01 03:09:58 +020010628 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020010629 PyErr_SetString(PyExc_OverflowError,
10630 "repeated string is too long");
10631 return NULL;
10632 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010633 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020010634
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010635 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010636 if (!u)
10637 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020010638 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010639
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010640 if (PyUnicode_GET_LENGTH(str) == 1) {
10641 const int kind = PyUnicode_KIND(str);
10642 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
10643 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020010644 if (kind == PyUnicode_1BYTE_KIND)
10645 memset(to, (unsigned char)fill_char, len);
10646 else {
10647 for (n = 0; n < len; ++n)
10648 PyUnicode_WRITE(kind, to, n, fill_char);
10649 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010650 }
10651 else {
10652 /* number of characters copied this far */
10653 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
10654 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
10655 char *to = (char *) PyUnicode_DATA(u);
10656 Py_MEMCPY(to, PyUnicode_DATA(str),
10657 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000010658 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010659 n = (done <= nchars-done) ? done : nchars-done;
10660 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010661 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000010662 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010663 }
10664
10665 return (PyObject*) u;
10666}
10667
Alexander Belopolsky40018472011-02-26 01:02:56 +000010668PyObject *
10669PyUnicode_Replace(PyObject *obj,
10670 PyObject *subobj,
10671 PyObject *replobj,
10672 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010673{
10674 PyObject *self;
10675 PyObject *str1;
10676 PyObject *str2;
10677 PyObject *result;
10678
10679 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020010680 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010681 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010682 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020010683 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010684 Py_DECREF(self);
10685 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010686 }
10687 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020010688 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010689 Py_DECREF(self);
10690 Py_DECREF(str1);
10691 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010692 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010693 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010694 Py_DECREF(self);
10695 Py_DECREF(str1);
10696 Py_DECREF(str2);
10697 return result;
10698}
10699
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010700PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000010701 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010702\n\
10703Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000010704old replaced by new. If the optional argument count is\n\
10705given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010706
10707static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010708unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010709{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010710 PyObject *str1;
10711 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010712 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010713 PyObject *result;
10714
Martin v. Löwis18e16552006-02-15 17:27:45 +000010715 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010716 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010717 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010718 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010719 str1 = PyUnicode_FromObject(str1);
10720 if (str1 == NULL || PyUnicode_READY(str1) == -1)
10721 return NULL;
10722 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020010723 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010724 Py_DECREF(str1);
10725 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000010726 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010727
10728 result = replace(self, str1, str2, maxcount);
10729
10730 Py_DECREF(str1);
10731 Py_DECREF(str2);
10732 return result;
10733}
10734
Alexander Belopolsky40018472011-02-26 01:02:56 +000010735static PyObject *
10736unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010737{
Walter Dörwald79e913e2007-05-12 11:08:06 +000010738 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010739 Py_ssize_t isize;
10740 Py_ssize_t osize, squote, dquote, i, o;
10741 Py_UCS4 max, quote;
10742 int ikind, okind;
10743 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000010744
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010745 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000010746 return NULL;
10747
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010748 isize = PyUnicode_GET_LENGTH(unicode);
10749 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010750
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010751 /* Compute length of output, quote characters, and
10752 maximum character */
10753 osize = 2; /* quotes */
10754 max = 127;
10755 squote = dquote = 0;
10756 ikind = PyUnicode_KIND(unicode);
10757 for (i = 0; i < isize; i++) {
10758 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
10759 switch (ch) {
10760 case '\'': squote++; osize++; break;
10761 case '"': dquote++; osize++; break;
10762 case '\\': case '\t': case '\r': case '\n':
10763 osize += 2; break;
10764 default:
10765 /* Fast-path ASCII */
10766 if (ch < ' ' || ch == 0x7f)
10767 osize += 4; /* \xHH */
10768 else if (ch < 0x7f)
10769 osize++;
10770 else if (Py_UNICODE_ISPRINTABLE(ch)) {
10771 osize++;
10772 max = ch > max ? ch : max;
10773 }
10774 else if (ch < 0x100)
10775 osize += 4; /* \xHH */
10776 else if (ch < 0x10000)
10777 osize += 6; /* \uHHHH */
10778 else
10779 osize += 10; /* \uHHHHHHHH */
10780 }
10781 }
10782
10783 quote = '\'';
10784 if (squote) {
10785 if (dquote)
10786 /* Both squote and dquote present. Use squote,
10787 and escape them */
10788 osize += squote;
10789 else
10790 quote = '"';
10791 }
10792
10793 repr = PyUnicode_New(osize, max);
10794 if (repr == NULL)
10795 return NULL;
10796 okind = PyUnicode_KIND(repr);
10797 odata = PyUnicode_DATA(repr);
10798
10799 PyUnicode_WRITE(okind, odata, 0, quote);
10800 PyUnicode_WRITE(okind, odata, osize-1, quote);
10801
10802 for (i = 0, o = 1; i < isize; i++) {
10803 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010804
10805 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010806 if ((ch == quote) || (ch == '\\')) {
10807 PyUnicode_WRITE(okind, odata, o++, '\\');
10808 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010809 continue;
10810 }
10811
Benjamin Peterson29060642009-01-31 22:14:21 +000010812 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000010813 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010814 PyUnicode_WRITE(okind, odata, o++, '\\');
10815 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010816 }
10817 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010818 PyUnicode_WRITE(okind, odata, o++, '\\');
10819 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010820 }
10821 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010822 PyUnicode_WRITE(okind, odata, o++, '\\');
10823 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010824 }
10825
10826 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000010827 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010828 PyUnicode_WRITE(okind, odata, o++, '\\');
10829 PyUnicode_WRITE(okind, odata, o++, 'x');
10830 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
10831 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010832 }
10833
Georg Brandl559e5d72008-06-11 18:37:52 +000010834 /* Copy ASCII characters as-is */
10835 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010836 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000010837 }
10838
Benjamin Peterson29060642009-01-31 22:14:21 +000010839 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000010840 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010841 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000010842 (categories Z* and C* except ASCII space)
10843 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010844 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010845 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010846 if (ch <= 0xff) {
10847 PyUnicode_WRITE(okind, odata, o++, '\\');
10848 PyUnicode_WRITE(okind, odata, o++, 'x');
10849 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
10850 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010851 }
10852 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010853 else if (ch >= 0x10000) {
10854 PyUnicode_WRITE(okind, odata, o++, '\\');
10855 PyUnicode_WRITE(okind, odata, o++, 'U');
10856 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
10857 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
10858 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
10859 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
10860 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
10861 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
10862 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
10863 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010864 }
10865 /* Map 16-bit characters to '\uxxxx' */
10866 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010867 PyUnicode_WRITE(okind, odata, o++, '\\');
10868 PyUnicode_WRITE(okind, odata, o++, 'u');
10869 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
10870 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
10871 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
10872 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010873 }
10874 }
10875 /* Copy characters as-is */
10876 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010877 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000010878 }
10879 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000010880 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010881 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000010882 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010883}
10884
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010885PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010886 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010887\n\
10888Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010889such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010890arguments start and end are interpreted as in slice notation.\n\
10891\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010892Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010893
10894static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010895unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010896{
Jesus Ceaac451502011-04-20 17:09:23 +020010897 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010898 Py_ssize_t start;
10899 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010900 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010901
Jesus Ceaac451502011-04-20 17:09:23 +020010902 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
10903 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000010904 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010905
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010906 if (PyUnicode_READY(self) == -1)
10907 return NULL;
10908 if (PyUnicode_READY(substring) == -1)
10909 return NULL;
10910
10911 result = any_find_slice(
10912 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
10913 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010914 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010915
10916 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010917
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010918 if (result == -2)
10919 return NULL;
10920
Christian Heimes217cfd12007-12-02 14:31:20 +000010921 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010922}
10923
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010924PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010925 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010926\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010927Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010928
10929static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010930unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010931{
Jesus Ceaac451502011-04-20 17:09:23 +020010932 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010933 Py_ssize_t start;
10934 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010935 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010936
Jesus Ceaac451502011-04-20 17:09:23 +020010937 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
10938 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000010939 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010940
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010941 if (PyUnicode_READY(self) == -1)
10942 return NULL;
10943 if (PyUnicode_READY(substring) == -1)
10944 return NULL;
10945
10946 result = any_find_slice(
10947 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
10948 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010949 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010950
10951 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010952
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010953 if (result == -2)
10954 return NULL;
10955
Guido van Rossumd57fd912000-03-10 22:53:23 +000010956 if (result < 0) {
10957 PyErr_SetString(PyExc_ValueError, "substring not found");
10958 return NULL;
10959 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010960
Christian Heimes217cfd12007-12-02 14:31:20 +000010961 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010962}
10963
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010964PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010965 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010966\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010967Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010968done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010969
10970static PyObject *
10971unicode_rjust(PyUnicodeObject *self, PyObject *args)
10972{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010973 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010974 Py_UCS4 fillchar = ' ';
10975
Victor Stinnere9a29352011-10-01 02:14:59 +020010976 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010977 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010978
Victor Stinnere9a29352011-10-01 02:14:59 +020010979 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010980 return NULL;
10981
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010982 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010983 Py_INCREF(self);
10984 return (PyObject*) self;
10985 }
10986
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010987 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010988}
10989
Alexander Belopolsky40018472011-02-26 01:02:56 +000010990PyObject *
10991PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010992{
10993 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000010994
Guido van Rossumd57fd912000-03-10 22:53:23 +000010995 s = PyUnicode_FromObject(s);
10996 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000010997 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000010998 if (sep != NULL) {
10999 sep = PyUnicode_FromObject(sep);
11000 if (sep == NULL) {
11001 Py_DECREF(s);
11002 return NULL;
11003 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011004 }
11005
11006 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11007
11008 Py_DECREF(s);
11009 Py_XDECREF(sep);
11010 return result;
11011}
11012
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011013PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011014 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011015\n\
11016Return a list of the words in S, using sep as the\n\
11017delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011018splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011019whitespace string is a separator and empty strings are\n\
11020removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011021
11022static PyObject*
11023unicode_split(PyUnicodeObject *self, PyObject *args)
11024{
11025 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011026 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011027
Martin v. Löwis18e16552006-02-15 17:27:45 +000011028 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011029 return NULL;
11030
11031 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011032 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011033 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011034 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011035 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011036 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011037}
11038
Thomas Wouters477c8d52006-05-27 19:21:47 +000011039PyObject *
11040PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11041{
11042 PyObject* str_obj;
11043 PyObject* sep_obj;
11044 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011045 int kind1, kind2, kind;
11046 void *buf1 = NULL, *buf2 = NULL;
11047 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011048
11049 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011050 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011051 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011052 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011053 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011054 Py_DECREF(str_obj);
11055 return NULL;
11056 }
11057
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011058 kind1 = PyUnicode_KIND(str_in);
11059 kind2 = PyUnicode_KIND(sep_obj);
11060 kind = kind1 > kind2 ? kind1 : kind2;
11061 buf1 = PyUnicode_DATA(str_in);
11062 if (kind1 != kind)
11063 buf1 = _PyUnicode_AsKind(str_in, kind);
11064 if (!buf1)
11065 goto onError;
11066 buf2 = PyUnicode_DATA(sep_obj);
11067 if (kind2 != kind)
11068 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11069 if (!buf2)
11070 goto onError;
11071 len1 = PyUnicode_GET_LENGTH(str_obj);
11072 len2 = PyUnicode_GET_LENGTH(sep_obj);
11073
11074 switch(PyUnicode_KIND(str_in)) {
11075 case PyUnicode_1BYTE_KIND:
11076 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11077 break;
11078 case PyUnicode_2BYTE_KIND:
11079 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11080 break;
11081 case PyUnicode_4BYTE_KIND:
11082 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11083 break;
11084 default:
11085 assert(0);
11086 out = 0;
11087 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011088
11089 Py_DECREF(sep_obj);
11090 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011091 if (kind1 != kind)
11092 PyMem_Free(buf1);
11093 if (kind2 != kind)
11094 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011095
11096 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011097 onError:
11098 Py_DECREF(sep_obj);
11099 Py_DECREF(str_obj);
11100 if (kind1 != kind && buf1)
11101 PyMem_Free(buf1);
11102 if (kind2 != kind && buf2)
11103 PyMem_Free(buf2);
11104 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011105}
11106
11107
11108PyObject *
11109PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11110{
11111 PyObject* str_obj;
11112 PyObject* sep_obj;
11113 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011114 int kind1, kind2, kind;
11115 void *buf1 = NULL, *buf2 = NULL;
11116 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011117
11118 str_obj = PyUnicode_FromObject(str_in);
11119 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011120 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011121 sep_obj = PyUnicode_FromObject(sep_in);
11122 if (!sep_obj) {
11123 Py_DECREF(str_obj);
11124 return NULL;
11125 }
11126
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011127 kind1 = PyUnicode_KIND(str_in);
11128 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011129 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011130 buf1 = PyUnicode_DATA(str_in);
11131 if (kind1 != kind)
11132 buf1 = _PyUnicode_AsKind(str_in, kind);
11133 if (!buf1)
11134 goto onError;
11135 buf2 = PyUnicode_DATA(sep_obj);
11136 if (kind2 != kind)
11137 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11138 if (!buf2)
11139 goto onError;
11140 len1 = PyUnicode_GET_LENGTH(str_obj);
11141 len2 = PyUnicode_GET_LENGTH(sep_obj);
11142
11143 switch(PyUnicode_KIND(str_in)) {
11144 case PyUnicode_1BYTE_KIND:
11145 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11146 break;
11147 case PyUnicode_2BYTE_KIND:
11148 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11149 break;
11150 case PyUnicode_4BYTE_KIND:
11151 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11152 break;
11153 default:
11154 assert(0);
11155 out = 0;
11156 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011157
11158 Py_DECREF(sep_obj);
11159 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011160 if (kind1 != kind)
11161 PyMem_Free(buf1);
11162 if (kind2 != kind)
11163 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011164
11165 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011166 onError:
11167 Py_DECREF(sep_obj);
11168 Py_DECREF(str_obj);
11169 if (kind1 != kind && buf1)
11170 PyMem_Free(buf1);
11171 if (kind2 != kind && buf2)
11172 PyMem_Free(buf2);
11173 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011174}
11175
11176PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011177 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011178\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011179Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011180the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011181found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011182
11183static PyObject*
11184unicode_partition(PyUnicodeObject *self, PyObject *separator)
11185{
11186 return PyUnicode_Partition((PyObject *)self, separator);
11187}
11188
11189PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011190 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011191\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011192Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011193the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011194separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011195
11196static PyObject*
11197unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11198{
11199 return PyUnicode_RPartition((PyObject *)self, separator);
11200}
11201
Alexander Belopolsky40018472011-02-26 01:02:56 +000011202PyObject *
11203PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011204{
11205 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011206
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011207 s = PyUnicode_FromObject(s);
11208 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011209 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011210 if (sep != NULL) {
11211 sep = PyUnicode_FromObject(sep);
11212 if (sep == NULL) {
11213 Py_DECREF(s);
11214 return NULL;
11215 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011216 }
11217
11218 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11219
11220 Py_DECREF(s);
11221 Py_XDECREF(sep);
11222 return result;
11223}
11224
11225PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011226 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011227\n\
11228Return a list of the words in S, using sep as the\n\
11229delimiter string, starting at the end of the string and\n\
11230working to the front. If maxsplit is given, at most maxsplit\n\
11231splits are done. If sep is not specified, any whitespace string\n\
11232is a separator.");
11233
11234static PyObject*
11235unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11236{
11237 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011238 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011239
Martin v. Löwis18e16552006-02-15 17:27:45 +000011240 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011241 return NULL;
11242
11243 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011244 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011245 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011246 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011247 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011248 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011249}
11250
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011251PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011252 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011253\n\
11254Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011255Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011256is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011257
11258static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011259unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011260{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011261 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011262 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011263
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011264 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11265 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011266 return NULL;
11267
Guido van Rossum86662912000-04-11 15:38:46 +000011268 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011269}
11270
11271static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011272PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011273{
Walter Dörwald346737f2007-05-31 10:44:43 +000011274 if (PyUnicode_CheckExact(self)) {
11275 Py_INCREF(self);
11276 return self;
11277 } else
11278 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020011279 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011280}
11281
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011282PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011283 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011284\n\
11285Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011286and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011287
11288static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011289unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011290{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011291 return fixup(self, fixswapcase);
11292}
11293
Georg Brandlceee0772007-11-27 23:48:05 +000011294PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011295 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011296\n\
11297Return a translation table usable for str.translate().\n\
11298If there is only one argument, it must be a dictionary mapping Unicode\n\
11299ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011300Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011301If there are two arguments, they must be strings of equal length, and\n\
11302in the resulting dictionary, each character in x will be mapped to the\n\
11303character at the same position in y. If there is a third argument, it\n\
11304must be a string, whose characters will be mapped to None in the result.");
11305
11306static PyObject*
11307unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11308{
11309 PyObject *x, *y = NULL, *z = NULL;
11310 PyObject *new = NULL, *key, *value;
11311 Py_ssize_t i = 0;
11312 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011313
Georg Brandlceee0772007-11-27 23:48:05 +000011314 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11315 return NULL;
11316 new = PyDict_New();
11317 if (!new)
11318 return NULL;
11319 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011320 int x_kind, y_kind, z_kind;
11321 void *x_data, *y_data, *z_data;
11322
Georg Brandlceee0772007-11-27 23:48:05 +000011323 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011324 if (!PyUnicode_Check(x)) {
11325 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11326 "be a string if there is a second argument");
11327 goto err;
11328 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011329 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011330 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11331 "arguments must have equal length");
11332 goto err;
11333 }
11334 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011335 x_kind = PyUnicode_KIND(x);
11336 y_kind = PyUnicode_KIND(y);
11337 x_data = PyUnicode_DATA(x);
11338 y_data = PyUnicode_DATA(y);
11339 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11340 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11341 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011342 if (!key || !value)
11343 goto err;
11344 res = PyDict_SetItem(new, key, value);
11345 Py_DECREF(key);
11346 Py_DECREF(value);
11347 if (res < 0)
11348 goto err;
11349 }
11350 /* create entries for deleting chars in z */
11351 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011352 z_kind = PyUnicode_KIND(z);
11353 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011354 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011355 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011356 if (!key)
11357 goto err;
11358 res = PyDict_SetItem(new, key, Py_None);
11359 Py_DECREF(key);
11360 if (res < 0)
11361 goto err;
11362 }
11363 }
11364 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011365 int kind;
11366 void *data;
11367
Georg Brandlceee0772007-11-27 23:48:05 +000011368 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011369 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011370 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11371 "to maketrans it must be a dict");
11372 goto err;
11373 }
11374 /* copy entries into the new dict, converting string keys to int keys */
11375 while (PyDict_Next(x, &i, &key, &value)) {
11376 if (PyUnicode_Check(key)) {
11377 /* convert string keys to integer keys */
11378 PyObject *newkey;
11379 if (PyUnicode_GET_SIZE(key) != 1) {
11380 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11381 "table must be of length 1");
11382 goto err;
11383 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011384 kind = PyUnicode_KIND(key);
11385 data = PyUnicode_DATA(key);
11386 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011387 if (!newkey)
11388 goto err;
11389 res = PyDict_SetItem(new, newkey, value);
11390 Py_DECREF(newkey);
11391 if (res < 0)
11392 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011393 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011394 /* just keep integer keys */
11395 if (PyDict_SetItem(new, key, value) < 0)
11396 goto err;
11397 } else {
11398 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11399 "be strings or integers");
11400 goto err;
11401 }
11402 }
11403 }
11404 return new;
11405 err:
11406 Py_DECREF(new);
11407 return NULL;
11408}
11409
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011410PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011411 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011412\n\
11413Return a copy of the string S, where all characters have been mapped\n\
11414through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011415Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011416Unmapped characters are left untouched. Characters mapped to None\n\
11417are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011418
11419static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011420unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011421{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011422 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011423}
11424
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011425PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011426 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011427\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011428Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011429
11430static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011431unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011432{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011433 return fixup(self, fixupper);
11434}
11435
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011436PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011437 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011438\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000011439Pad a numeric string S with zeros on the left, to fill a field\n\
11440of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011441
11442static PyObject *
11443unicode_zfill(PyUnicodeObject *self, PyObject *args)
11444{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011445 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011446 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011447 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011448 int kind;
11449 void *data;
11450 Py_UCS4 chr;
11451
11452 if (PyUnicode_READY(self) == -1)
11453 return NULL;
11454
Martin v. Löwis18e16552006-02-15 17:27:45 +000011455 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011456 return NULL;
11457
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011458 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000011459 if (PyUnicode_CheckExact(self)) {
11460 Py_INCREF(self);
11461 return (PyObject*) self;
11462 }
11463 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020011464 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011465 }
11466
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011467 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011468
11469 u = pad(self, fill, 0, '0');
11470
Walter Dörwald068325e2002-04-15 13:36:47 +000011471 if (u == NULL)
11472 return NULL;
11473
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011474 kind = PyUnicode_KIND(u);
11475 data = PyUnicode_DATA(u);
11476 chr = PyUnicode_READ(kind, data, fill);
11477
11478 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011479 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011480 PyUnicode_WRITE(kind, data, 0, chr);
11481 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000011482 }
11483
11484 return (PyObject*) u;
11485}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011486
#if 0
/* Compiled out: wrapper that forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
11494
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011495PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011496 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011497\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011498Return True if S starts with the specified prefix, False otherwise.\n\
11499With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011500With optional end, stop comparing S at that position.\n\
11501prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011502
11503static PyObject *
11504unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011505 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011506{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011507 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011508 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011509 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011510 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011511 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011512
Jesus Ceaac451502011-04-20 17:09:23 +020011513 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011514 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011515 if (PyTuple_Check(subobj)) {
11516 Py_ssize_t i;
11517 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11518 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011519 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011520 if (substring == NULL)
11521 return NULL;
11522 result = tailmatch(self, substring, start, end, -1);
11523 Py_DECREF(substring);
11524 if (result) {
11525 Py_RETURN_TRUE;
11526 }
11527 }
11528 /* nothing matched */
11529 Py_RETURN_FALSE;
11530 }
11531 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011532 if (substring == NULL) {
11533 if (PyErr_ExceptionMatches(PyExc_TypeError))
11534 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
11535 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011536 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011537 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011538 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011539 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011540 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011541}
11542
11543
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011544PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011545 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011546\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011547Return True if S ends with the specified suffix, False otherwise.\n\
11548With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011549With optional end, stop comparing S at that position.\n\
11550suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011551
11552static PyObject *
11553unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011554 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011555{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011556 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011557 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011558 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011559 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011560 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011561
Jesus Ceaac451502011-04-20 17:09:23 +020011562 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011563 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011564 if (PyTuple_Check(subobj)) {
11565 Py_ssize_t i;
11566 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11567 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011568 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011569 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011570 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011571 result = tailmatch(self, substring, start, end, +1);
11572 Py_DECREF(substring);
11573 if (result) {
11574 Py_RETURN_TRUE;
11575 }
11576 }
11577 Py_RETURN_FALSE;
11578 }
11579 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011580 if (substring == NULL) {
11581 if (PyErr_ExceptionMatches(PyExc_TypeError))
11582 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
11583 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011584 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011585 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011586 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011587 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011588 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011589}
11590
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011591#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000011592
11593PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011594 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011595\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011596Return a formatted version of S, using substitutions from args and kwargs.\n\
11597The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000011598
Eric Smith27bbca62010-11-04 17:06:58 +000011599PyDoc_STRVAR(format_map__doc__,
11600 "S.format_map(mapping) -> str\n\
11601\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011602Return a formatted version of S, using substitutions from mapping.\n\
11603The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000011604
Eric Smith4a7d76d2008-05-30 18:10:19 +000011605static PyObject *
11606unicode__format__(PyObject* self, PyObject* args)
11607{
11608 PyObject *format_spec;
11609
11610 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
11611 return NULL;
11612
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011613 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
11614 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000011615}
11616
Eric Smith8c663262007-08-25 02:26:07 +000011617PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011618 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011619\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011620Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000011621
11622static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011623unicode__sizeof__(PyUnicodeObject *v)
11624{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011625 Py_ssize_t size;
11626
11627 /* If it's a compact object, account for base structure +
11628 character data. */
11629 if (PyUnicode_IS_COMPACT_ASCII(v))
11630 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
11631 else if (PyUnicode_IS_COMPACT(v))
11632 size = sizeof(PyCompactUnicodeObject) +
11633 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
11634 else {
11635 /* If it is a two-block object, account for base object, and
11636 for character block if present. */
11637 size = sizeof(PyUnicodeObject);
11638 if (v->data.any)
11639 size += (PyUnicode_GET_LENGTH(v) + 1) *
11640 PyUnicode_CHARACTER_SIZE(v);
11641 }
11642 /* If the wstr pointer is present, account for it unless it is shared
11643 with the data pointer. Since PyUnicode_DATA will crash if the object
11644 is not ready, check whether it's either not ready (in which case the
11645 data is entirely in wstr) or if the data is not shared. */
11646 if (_PyUnicode_WSTR(v) &&
11647 (!PyUnicode_IS_READY(v) ||
11648 (PyUnicode_DATA(v) != _PyUnicode_WSTR(v))))
11649 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinnere90fe6a2011-10-01 16:48:13 +020011650 if (!PyUnicode_IS_COMPACT_ASCII(v)
11651 && _PyUnicode_UTF8(v)
11652 && _PyUnicode_UTF8(v) != PyUnicode_DATA(v))
11653 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011654
11655 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011656}
11657
11658PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011659 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011660
11661static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020011662unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011663{
Victor Stinner034f6cf2011-09-30 02:26:44 +020011664 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011665 if (!copy)
11666 return NULL;
11667 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011668}
11669
Guido van Rossumd57fd912000-03-10 22:53:23 +000011670static PyMethodDef unicode_methods[] = {
11671
11672 /* Order is according to common usage: often used methods should
11673 appear first, since lookup is done sequentially. */
11674
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000011675 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011676 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
11677 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011678 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011679 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
11680 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
11681 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
11682 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
11683 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
11684 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
11685 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000011686 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011687 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
11688 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
11689 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011690 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011691 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
11692 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
11693 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011694 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000011695 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011696 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011697 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011698 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
11699 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
11700 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
11701 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
11702 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
11703 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
11704 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
11705 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
11706 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
11707 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
11708 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
11709 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
11710 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
11711 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000011712 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000011713 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011714 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000011715 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000011716 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000011717 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000011718 {"maketrans", (PyCFunction) unicode_maketrans,
11719 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011720 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000011721#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011722 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011723#endif
11724
11725#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011726 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011727 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011728#endif
11729
Benjamin Peterson14339b62009-01-31 16:36:08 +000011730 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011731 {NULL, NULL}
11732};
11733
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011734static PyObject *
11735unicode_mod(PyObject *v, PyObject *w)
11736{
Brian Curtindfc80e32011-08-10 20:28:54 -050011737 if (!PyUnicode_Check(v))
11738 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000011739 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011740}
11741
11742static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011743 0, /*nb_add*/
11744 0, /*nb_subtract*/
11745 0, /*nb_multiply*/
11746 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011747};
11748
Guido van Rossumd57fd912000-03-10 22:53:23 +000011749static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011750 (lenfunc) unicode_length, /* sq_length */
11751 PyUnicode_Concat, /* sq_concat */
11752 (ssizeargfunc) unicode_repeat, /* sq_repeat */
11753 (ssizeargfunc) unicode_getitem, /* sq_item */
11754 0, /* sq_slice */
11755 0, /* sq_ass_item */
11756 0, /* sq_ass_slice */
11757 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011758};
11759
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011760static PyObject*
11761unicode_subscript(PyUnicodeObject* self, PyObject* item)
11762{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011763 if (PyUnicode_READY(self) == -1)
11764 return NULL;
11765
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011766 if (PyIndex_Check(item)) {
11767 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011768 if (i == -1 && PyErr_Occurred())
11769 return NULL;
11770 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011771 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011772 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011773 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000011774 Py_ssize_t start, stop, step, slicelength, cur, i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011775 const Py_UNICODE* source_buf;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011776 Py_UNICODE* result_buf;
11777 PyObject* result;
11778
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011779 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000011780 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011781 return NULL;
11782 }
11783
11784 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011785 return PyUnicode_New(0, 0);
11786 } else if (start == 0 && step == 1 &&
11787 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000011788 PyUnicode_CheckExact(self)) {
11789 Py_INCREF(self);
11790 return (PyObject *)self;
11791 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011792 return PyUnicode_Substring((PyObject*)self,
11793 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011794 } else {
11795 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +000011796 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
11797 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011798
Benjamin Peterson29060642009-01-31 22:14:21 +000011799 if (result_buf == NULL)
11800 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011801
11802 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
11803 result_buf[i] = source_buf[cur];
11804 }
Tim Petersced69f82003-09-16 20:30:58 +000011805
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011806 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +000011807 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011808 return result;
11809 }
11810 } else {
11811 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
11812 return NULL;
11813 }
11814}
11815
11816static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011817 (lenfunc)unicode_length, /* mp_length */
11818 (binaryfunc)unicode_subscript, /* mp_subscript */
11819 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011820};
11821
Guido van Rossumd57fd912000-03-10 22:53:23 +000011822
Guido van Rossumd57fd912000-03-10 22:53:23 +000011823/* Helpers for PyUnicode_Format() */
11824
11825static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000011826getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011827{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011828 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011829 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011830 (*p_argidx)++;
11831 if (arglen < 0)
11832 return args;
11833 else
11834 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011835 }
11836 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011837 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011838 return NULL;
11839}
11840
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011841/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011842
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011843static PyObject *
11844formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011845{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011846 char *p;
11847 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011848 double x;
Tim Petersced69f82003-09-16 20:30:58 +000011849
Guido van Rossumd57fd912000-03-10 22:53:23 +000011850 x = PyFloat_AsDouble(v);
11851 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011852 return NULL;
11853
Guido van Rossumd57fd912000-03-10 22:53:23 +000011854 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011855 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000011856
Eric Smith0923d1d2009-04-16 20:16:10 +000011857 p = PyOS_double_to_string(x, type, prec,
11858 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011859 if (p == NULL)
11860 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011861 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000011862 PyMem_Free(p);
11863 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011864}
11865
Tim Peters38fd5b62000-09-21 05:43:11 +000011866static PyObject*
11867formatlong(PyObject *val, int flags, int prec, int type)
11868{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011869 char *buf;
11870 int len;
11871 PyObject *str; /* temporary string object. */
11872 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011873
Benjamin Peterson14339b62009-01-31 16:36:08 +000011874 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
11875 if (!str)
11876 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011877 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011878 Py_DECREF(str);
11879 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011880}
11881
Guido van Rossumd57fd912000-03-10 22:53:23 +000011882static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011883formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011884 size_t buflen,
11885 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011886{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011887 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011888 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011889 if (PyUnicode_GET_LENGTH(v) == 1) {
11890 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000011891 buf[1] = '\0';
11892 return 1;
11893 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011894 goto onError;
11895 }
11896 else {
11897 /* Integer input truncated to a character */
11898 long x;
11899 x = PyLong_AsLong(v);
11900 if (x == -1 && PyErr_Occurred())
11901 goto onError;
11902
11903 if (x < 0 || x > 0x10ffff) {
11904 PyErr_SetString(PyExc_OverflowError,
11905 "%c arg not in range(0x110000)");
11906 return -1;
11907 }
11908
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011909 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011910 buf[1] = '\0';
11911 return 1;
11912 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011913
Benjamin Peterson29060642009-01-31 22:14:21 +000011914 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011915 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011916 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011917 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011918}
11919
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
   FORMATBUFLEN is the length of the buffer in which chars are formatted.
   The expansion is fully parenthesized so that the macro behaves as a
   single operand wherever it is used (for instance after sizeof).
*/
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011924
Alexander Belopolsky40018472011-02-26 01:02:56 +000011925PyObject *
11926PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011927{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011928 void *fmt;
11929 int fmtkind;
11930 PyObject *result;
11931 Py_UCS4 *res, *res0;
11932 Py_UCS4 max;
11933 int kind;
11934 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011935 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011936 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011937 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000011938
Guido van Rossumd57fd912000-03-10 22:53:23 +000011939 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011940 PyErr_BadInternalCall();
11941 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011942 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011943 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
11944 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011945 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011946 fmt = PyUnicode_DATA(uformat);
11947 fmtkind = PyUnicode_KIND(uformat);
11948 fmtcnt = PyUnicode_GET_LENGTH(uformat);
11949 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011950
11951 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011952 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
11953 if (res0 == NULL) {
11954 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000011955 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011956 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011957
11958 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011959 arglen = PyTuple_Size(args);
11960 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011961 }
11962 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011963 arglen = -1;
11964 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011965 }
Christian Heimes90aa7642007-12-19 02:45:37 +000011966 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000011967 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000011968 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011969
11970 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011971 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011972 if (--rescnt < 0) {
11973 rescnt = fmtcnt + 100;
11974 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011975 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
11976 if (res0 == NULL){
11977 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000011978 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011979 }
11980 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000011981 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011982 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011983 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011984 }
11985 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011986 /* Got a format specifier */
11987 int flags = 0;
11988 Py_ssize_t width = -1;
11989 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011990 Py_UCS4 c = '\0';
11991 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000011992 int isnumok;
11993 PyObject *v = NULL;
11994 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011995 void *pbuf;
11996 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000011997 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011998 Py_ssize_t len, len1;
11999 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012000
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012001 fmtpos++;
12002 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12003 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012004 Py_ssize_t keylen;
12005 PyObject *key;
12006 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012007
Benjamin Peterson29060642009-01-31 22:14:21 +000012008 if (dict == NULL) {
12009 PyErr_SetString(PyExc_TypeError,
12010 "format requires a mapping");
12011 goto onError;
12012 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012013 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012014 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012015 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012016 /* Skip over balanced parentheses */
12017 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012018 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012019 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012020 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012021 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012022 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012023 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012024 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012025 if (fmtcnt < 0 || pcount > 0) {
12026 PyErr_SetString(PyExc_ValueError,
12027 "incomplete format key");
12028 goto onError;
12029 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012030 key = PyUnicode_Substring((PyObject*)uformat,
12031 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012032 if (key == NULL)
12033 goto onError;
12034 if (args_owned) {
12035 Py_DECREF(args);
12036 args_owned = 0;
12037 }
12038 args = PyObject_GetItem(dict, key);
12039 Py_DECREF(key);
12040 if (args == NULL) {
12041 goto onError;
12042 }
12043 args_owned = 1;
12044 arglen = -1;
12045 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012046 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012047 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012048 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012049 case '-': flags |= F_LJUST; continue;
12050 case '+': flags |= F_SIGN; continue;
12051 case ' ': flags |= F_BLANK; continue;
12052 case '#': flags |= F_ALT; continue;
12053 case '0': flags |= F_ZERO; continue;
12054 }
12055 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012056 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012057 if (c == '*') {
12058 v = getnextarg(args, arglen, &argidx);
12059 if (v == NULL)
12060 goto onError;
12061 if (!PyLong_Check(v)) {
12062 PyErr_SetString(PyExc_TypeError,
12063 "* wants int");
12064 goto onError;
12065 }
12066 width = PyLong_AsLong(v);
12067 if (width == -1 && PyErr_Occurred())
12068 goto onError;
12069 if (width < 0) {
12070 flags |= F_LJUST;
12071 width = -width;
12072 }
12073 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012074 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012075 }
12076 else if (c >= '0' && c <= '9') {
12077 width = c - '0';
12078 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012079 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012080 if (c < '0' || c > '9')
12081 break;
12082 if ((width*10) / 10 != width) {
12083 PyErr_SetString(PyExc_ValueError,
12084 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012085 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012086 }
12087 width = width*10 + (c - '0');
12088 }
12089 }
12090 if (c == '.') {
12091 prec = 0;
12092 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012093 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012094 if (c == '*') {
12095 v = getnextarg(args, arglen, &argidx);
12096 if (v == NULL)
12097 goto onError;
12098 if (!PyLong_Check(v)) {
12099 PyErr_SetString(PyExc_TypeError,
12100 "* wants int");
12101 goto onError;
12102 }
12103 prec = PyLong_AsLong(v);
12104 if (prec == -1 && PyErr_Occurred())
12105 goto onError;
12106 if (prec < 0)
12107 prec = 0;
12108 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012109 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012110 }
12111 else if (c >= '0' && c <= '9') {
12112 prec = c - '0';
12113 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012114 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012115 if (c < '0' || c > '9')
12116 break;
12117 if ((prec*10) / 10 != prec) {
12118 PyErr_SetString(PyExc_ValueError,
12119 "prec too big");
12120 goto onError;
12121 }
12122 prec = prec*10 + (c - '0');
12123 }
12124 }
12125 } /* prec */
12126 if (fmtcnt >= 0) {
12127 if (c == 'h' || c == 'l' || c == 'L') {
12128 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012129 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012130 }
12131 }
12132 if (fmtcnt < 0) {
12133 PyErr_SetString(PyExc_ValueError,
12134 "incomplete format");
12135 goto onError;
12136 }
12137 if (c != '%') {
12138 v = getnextarg(args, arglen, &argidx);
12139 if (v == NULL)
12140 goto onError;
12141 }
12142 sign = 0;
12143 fill = ' ';
12144 switch (c) {
12145
12146 case '%':
12147 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012148 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012149 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012150 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012151 len = 1;
12152 break;
12153
12154 case 's':
12155 case 'r':
12156 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012157 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012158 temp = v;
12159 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012160 }
12161 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012162 if (c == 's')
12163 temp = PyObject_Str(v);
12164 else if (c == 'r')
12165 temp = PyObject_Repr(v);
12166 else
12167 temp = PyObject_ASCII(v);
12168 if (temp == NULL)
12169 goto onError;
12170 if (PyUnicode_Check(temp))
12171 /* nothing to do */;
12172 else {
12173 Py_DECREF(temp);
12174 PyErr_SetString(PyExc_TypeError,
12175 "%s argument has non-string str()");
12176 goto onError;
12177 }
12178 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012179 if (PyUnicode_READY(temp) == -1) {
12180 Py_CLEAR(temp);
12181 goto onError;
12182 }
12183 pbuf = PyUnicode_DATA(temp);
12184 kind = PyUnicode_KIND(temp);
12185 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012186 if (prec >= 0 && len > prec)
12187 len = prec;
12188 break;
12189
12190 case 'i':
12191 case 'd':
12192 case 'u':
12193 case 'o':
12194 case 'x':
12195 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012196 isnumok = 0;
12197 if (PyNumber_Check(v)) {
12198 PyObject *iobj=NULL;
12199
12200 if (PyLong_Check(v)) {
12201 iobj = v;
12202 Py_INCREF(iobj);
12203 }
12204 else {
12205 iobj = PyNumber_Long(v);
12206 }
12207 if (iobj!=NULL) {
12208 if (PyLong_Check(iobj)) {
12209 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012210 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012211 Py_DECREF(iobj);
12212 if (!temp)
12213 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012214 if (PyUnicode_READY(temp) == -1) {
12215 Py_CLEAR(temp);
12216 goto onError;
12217 }
12218 pbuf = PyUnicode_DATA(temp);
12219 kind = PyUnicode_KIND(temp);
12220 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012221 sign = 1;
12222 }
12223 else {
12224 Py_DECREF(iobj);
12225 }
12226 }
12227 }
12228 if (!isnumok) {
12229 PyErr_Format(PyExc_TypeError,
12230 "%%%c format: a number is required, "
12231 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12232 goto onError;
12233 }
12234 if (flags & F_ZERO)
12235 fill = '0';
12236 break;
12237
12238 case 'e':
12239 case 'E':
12240 case 'f':
12241 case 'F':
12242 case 'g':
12243 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012244 temp = formatfloat(v, flags, prec, c);
12245 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012246 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012247 if (PyUnicode_READY(temp) == -1) {
12248 Py_CLEAR(temp);
12249 goto onError;
12250 }
12251 pbuf = PyUnicode_DATA(temp);
12252 kind = PyUnicode_KIND(temp);
12253 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012254 sign = 1;
12255 if (flags & F_ZERO)
12256 fill = '0';
12257 break;
12258
12259 case 'c':
12260 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012261 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012262 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012263 if (len < 0)
12264 goto onError;
12265 break;
12266
12267 default:
12268 PyErr_Format(PyExc_ValueError,
12269 "unsupported format character '%c' (0x%x) "
12270 "at index %zd",
12271 (31<=c && c<=126) ? (char)c : '?',
12272 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012273 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012274 goto onError;
12275 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012276 /* pbuf is initialized here. */
12277 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012278 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012279 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12280 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12281 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012282 len--;
12283 }
12284 else if (flags & F_SIGN)
12285 sign = '+';
12286 else if (flags & F_BLANK)
12287 sign = ' ';
12288 else
12289 sign = 0;
12290 }
12291 if (width < len)
12292 width = len;
12293 if (rescnt - (sign != 0) < width) {
12294 reslen -= rescnt;
12295 rescnt = width + fmtcnt + 100;
12296 reslen += rescnt;
12297 if (reslen < 0) {
12298 Py_XDECREF(temp);
12299 PyErr_NoMemory();
12300 goto onError;
12301 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012302 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12303 if (res0 == 0) {
12304 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012305 Py_XDECREF(temp);
12306 goto onError;
12307 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012308 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012309 }
12310 if (sign) {
12311 if (fill != ' ')
12312 *res++ = sign;
12313 rescnt--;
12314 if (width > len)
12315 width--;
12316 }
12317 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012318 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12319 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012320 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012321 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12322 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012323 }
12324 rescnt -= 2;
12325 width -= 2;
12326 if (width < 0)
12327 width = 0;
12328 len -= 2;
12329 }
12330 if (width > len && !(flags & F_LJUST)) {
12331 do {
12332 --rescnt;
12333 *res++ = fill;
12334 } while (--width > len);
12335 }
12336 if (fill == ' ') {
12337 if (sign)
12338 *res++ = sign;
12339 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012340 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12341 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12342 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12343 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012344 }
12345 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012346 /* Copy all characters, preserving len */
12347 len1 = len;
12348 while (len1--) {
12349 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12350 rescnt--;
12351 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012352 while (--width >= len) {
12353 --rescnt;
12354 *res++ = ' ';
12355 }
12356 if (dict && (argidx < arglen) && c != '%') {
12357 PyErr_SetString(PyExc_TypeError,
12358 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012359 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012360 goto onError;
12361 }
12362 Py_XDECREF(temp);
12363 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012364 } /* until end */
12365 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012366 PyErr_SetString(PyExc_TypeError,
12367 "not all arguments converted during string formatting");
12368 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012369 }
12370
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012371
12372 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12373 if (*res > max)
12374 max = *res;
12375 result = PyUnicode_New(reslen - rescnt, max);
12376 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012377 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012378 kind = PyUnicode_KIND(result);
12379 for (res = res0; res < res0+reslen-rescnt; res++)
12380 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12381 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012382 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012383 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012384 }
12385 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012386 return (PyObject *)result;
12387
Benjamin Peterson29060642009-01-31 22:14:21 +000012388 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012389 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012390 Py_DECREF(uformat);
12391 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012392 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012393 }
12394 return NULL;
12395}
12396
Jeremy Hylton938ace62002-07-17 16:30:39 +000012397static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012398unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12399
Tim Peters6d6c1a32001-08-02 04:15:00 +000012400static PyObject *
12401unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12402{
Benjamin Peterson29060642009-01-31 22:14:21 +000012403 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012404 static char *kwlist[] = {"object", "encoding", "errors", 0};
12405 char *encoding = NULL;
12406 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012407
Benjamin Peterson14339b62009-01-31 16:36:08 +000012408 if (type != &PyUnicode_Type)
12409 return unicode_subtype_new(type, args, kwds);
12410 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012411 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012412 return NULL;
12413 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012414 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012415 if (encoding == NULL && errors == NULL)
12416 return PyObject_Str(x);
12417 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012418 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012419}
12420
Guido van Rossume023fe02001-08-30 03:12:59 +000012421static PyObject *
12422unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12423{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012424 PyUnicodeObject *unicode, *self;
12425 Py_ssize_t length, char_size;
12426 int share_wstr, share_utf8;
12427 unsigned int kind;
12428 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000012429
Benjamin Peterson14339b62009-01-31 16:36:08 +000012430 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012431
12432 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12433 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012434 return NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012435 assert(PyUnicode_Check(unicode));
12436 if (PyUnicode_READY(unicode))
12437 return NULL;
12438
12439 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
12440 if (self == NULL) {
12441 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012442 return NULL;
12443 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012444 kind = PyUnicode_KIND(unicode);
12445 length = PyUnicode_GET_LENGTH(unicode);
12446
12447 _PyUnicode_LENGTH(self) = length;
12448 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
12449 _PyUnicode_STATE(self).interned = 0;
12450 _PyUnicode_STATE(self).kind = kind;
12451 _PyUnicode_STATE(self).compact = 0;
12452 _PyUnicode_STATE(self).ascii = 0;
12453 _PyUnicode_STATE(self).ready = 1;
12454 _PyUnicode_WSTR(self) = NULL;
12455 _PyUnicode_UTF8_LENGTH(self) = 0;
12456 _PyUnicode_UTF8(self) = NULL;
12457 _PyUnicode_WSTR_LENGTH(self) = 0;
12458 self->data.any = NULL;
12459
12460 share_utf8 = 0;
12461 share_wstr = 0;
12462 if (kind == PyUnicode_1BYTE_KIND) {
12463 char_size = 1;
12464 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
12465 share_utf8 = 1;
12466 }
12467 else if (kind == PyUnicode_2BYTE_KIND) {
12468 char_size = 2;
12469 if (sizeof(wchar_t) == 2)
12470 share_wstr = 1;
12471 }
12472 else {
12473 assert(kind == PyUnicode_4BYTE_KIND);
12474 char_size = 4;
12475 if (sizeof(wchar_t) == 4)
12476 share_wstr = 1;
12477 }
12478
12479 /* Ensure we won't overflow the length. */
12480 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
12481 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012482 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012483 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012484 data = PyObject_MALLOC((length + 1) * char_size);
12485 if (data == NULL) {
12486 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012487 goto onError;
12488 }
12489
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012490 self->data.any = data;
12491 if (share_utf8) {
12492 _PyUnicode_UTF8_LENGTH(self) = length;
12493 _PyUnicode_UTF8(self) = data;
12494 }
12495 if (share_wstr) {
12496 _PyUnicode_WSTR_LENGTH(self) = length;
12497 _PyUnicode_WSTR(self) = (wchar_t *)data;
12498 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012499
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012500 Py_MEMCPY(data, PyUnicode_DATA(unicode),
12501 PyUnicode_KIND_SIZE(kind, length + 1));
12502 Py_DECREF(unicode);
12503 return (PyObject *)self;
12504
12505onError:
12506 Py_DECREF(unicode);
12507 Py_DECREF(self);
12508 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000012509}
12510
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012511PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000012512 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000012513\n\
Collin Winterd474ce82007-08-07 19:42:11 +000012514Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000012515encoding defaults to the current default string encoding.\n\
12516errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000012517
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012518static PyObject *unicode_iter(PyObject *seq);
12519
/* Type object for str.  Slots are filled positionally; the comment on each
   line names the slot that the value occupies. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash*/
    0,                  /* tp_call*/
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    unicode_iter,           /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
12563
12564/* Initialize the Unicode implementation */
12565
Thomas Wouters78890102000-07-22 19:25:51 +000012566void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012567{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012568 int i;
12569
Thomas Wouters477c8d52006-05-27 19:21:47 +000012570 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012571 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012572 0x000A, /* LINE FEED */
12573 0x000D, /* CARRIAGE RETURN */
12574 0x001C, /* FILE SEPARATOR */
12575 0x001D, /* GROUP SEPARATOR */
12576 0x001E, /* RECORD SEPARATOR */
12577 0x0085, /* NEXT LINE */
12578 0x2028, /* LINE SEPARATOR */
12579 0x2029, /* PARAGRAPH SEPARATOR */
12580 };
12581
Fred Drakee4315f52000-05-09 19:53:39 +000012582 /* Init the implementation */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012583 unicode_empty = (PyUnicodeObject *) PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012584 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012585 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012586
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012587 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000012588 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000012589 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012590 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012591
12592 /* initialize the linebreak bloom filter */
12593 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012594 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020012595 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012596
12597 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012598}
12599
12600/* Finalize the Unicode implementation */
12601
/* Does no work in this implementation and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int result = 0;

    return result;
}
12607
Guido van Rossumd57fd912000-03-10 22:53:23 +000012608void
Thomas Wouters78890102000-07-22 19:25:51 +000012609_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012610{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012611 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012612
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000012613 Py_XDECREF(unicode_empty);
12614 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000012615
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012616 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012617 if (unicode_latin1[i]) {
12618 Py_DECREF(unicode_latin1[i]);
12619 unicode_latin1[i] = NULL;
12620 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012621 }
Christian Heimesa156e092008-02-16 07:38:31 +000012622 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012623}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000012624
Walter Dörwald16807132007-05-25 13:52:07 +000012625void
12626PyUnicode_InternInPlace(PyObject **p)
12627{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012628 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
12629 PyObject *t;
12630 if (s == NULL || !PyUnicode_Check(s))
12631 Py_FatalError(
12632 "PyUnicode_InternInPlace: unicode strings only please!");
12633 /* If it's a subclass, we don't really know what putting
12634 it in the interned dict might do. */
12635 if (!PyUnicode_CheckExact(s))
12636 return;
12637 if (PyUnicode_CHECK_INTERNED(s))
12638 return;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012639 if (PyUnicode_READY(s) == -1) {
12640 assert(0 && "ready fail in intern...");
12641 return;
12642 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012643 if (interned == NULL) {
12644 interned = PyDict_New();
12645 if (interned == NULL) {
12646 PyErr_Clear(); /* Don't leave an exception */
12647 return;
12648 }
12649 }
12650 /* It might be that the GetItem call fails even
12651 though the key is present in the dictionary,
12652 namely when this happens during a stack overflow. */
12653 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000012654 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012655 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000012656
Benjamin Peterson29060642009-01-31 22:14:21 +000012657 if (t) {
12658 Py_INCREF(t);
12659 Py_DECREF(*p);
12660 *p = t;
12661 return;
12662 }
Walter Dörwald16807132007-05-25 13:52:07 +000012663
Benjamin Peterson14339b62009-01-31 16:36:08 +000012664 PyThreadState_GET()->recursion_critical = 1;
12665 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
12666 PyErr_Clear();
12667 PyThreadState_GET()->recursion_critical = 0;
12668 return;
12669 }
12670 PyThreadState_GET()->recursion_critical = 0;
12671 /* The two references in interned are not counted by refcnt.
12672 The deallocator will take care of this */
12673 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012674 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000012675}
12676
12677void
12678PyUnicode_InternImmortal(PyObject **p)
12679{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012680 PyUnicodeObject *u = (PyUnicodeObject *)*p;
12681
Benjamin Peterson14339b62009-01-31 16:36:08 +000012682 PyUnicode_InternInPlace(p);
12683 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012684 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012685 Py_INCREF(*p);
12686 }
Walter Dörwald16807132007-05-25 13:52:07 +000012687}
12688
12689PyObject *
12690PyUnicode_InternFromString(const char *cp)
12691{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012692 PyObject *s = PyUnicode_FromString(cp);
12693 if (s == NULL)
12694 return NULL;
12695 PyUnicode_InternInPlace(&s);
12696 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000012697}
12698
Alexander Belopolsky40018472011-02-26 01:02:56 +000012699void
12700_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000012701{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012702 PyObject *keys;
12703 PyUnicodeObject *s;
12704 Py_ssize_t i, n;
12705 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000012706
Benjamin Peterson14339b62009-01-31 16:36:08 +000012707 if (interned == NULL || !PyDict_Check(interned))
12708 return;
12709 keys = PyDict_Keys(interned);
12710 if (keys == NULL || !PyList_Check(keys)) {
12711 PyErr_Clear();
12712 return;
12713 }
Walter Dörwald16807132007-05-25 13:52:07 +000012714
Benjamin Peterson14339b62009-01-31 16:36:08 +000012715 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
12716 detector, interned unicode strings are not forcibly deallocated;
12717 rather, we give them their stolen references back, and then clear
12718 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000012719
Benjamin Peterson14339b62009-01-31 16:36:08 +000012720 n = PyList_GET_SIZE(keys);
12721 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000012722 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012723 for (i = 0; i < n; i++) {
12724 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012725 if (PyUnicode_READY(s) == -1)
12726 fprintf(stderr, "could not ready string\n");
12727 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012728 case SSTATE_NOT_INTERNED:
12729 /* XXX Shouldn't happen */
12730 break;
12731 case SSTATE_INTERNED_IMMORTAL:
12732 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012733 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012734 break;
12735 case SSTATE_INTERNED_MORTAL:
12736 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012737 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012738 break;
12739 default:
12740 Py_FatalError("Inconsistent interned string state.");
12741 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012742 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012743 }
12744 fprintf(stderr, "total size of all interned strings: "
12745 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
12746 "mortal/immortal\n", mortal_size, immortal_size);
12747 Py_DECREF(keys);
12748 PyDict_Clear(interned);
12749 Py_DECREF(interned);
12750 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000012751}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012752
12753
12754/********************* Unicode Iterator **************************/
12755
/* State of an iterator over the characters of a str object. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;     /* index of the next character to return */
    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
12761
12762static void
12763unicodeiter_dealloc(unicodeiterobject *it)
12764{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012765 _PyObject_GC_UNTRACK(it);
12766 Py_XDECREF(it->it_seq);
12767 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012768}
12769
/* GC traversal for the str iterator: the only object it references is the
   string being iterated.  Py_VISIT relies on the parameter names `visit`
   and `arg`. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
12776
12777static PyObject *
12778unicodeiter_next(unicodeiterobject *it)
12779{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012780 PyUnicodeObject *seq;
12781 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012782
Benjamin Peterson14339b62009-01-31 16:36:08 +000012783 assert(it != NULL);
12784 seq = it->it_seq;
12785 if (seq == NULL)
12786 return NULL;
12787 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012788
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012789 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
12790 int kind = PyUnicode_KIND(seq);
12791 void *data = PyUnicode_DATA(seq);
12792 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
12793 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012794 if (item != NULL)
12795 ++it->it_index;
12796 return item;
12797 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012798
Benjamin Peterson14339b62009-01-31 16:36:08 +000012799 Py_DECREF(seq);
12800 it->it_seq = NULL;
12801 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012802}
12803
12804static PyObject *
12805unicodeiter_len(unicodeiterobject *it)
12806{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012807 Py_ssize_t len = 0;
12808 if (it->it_seq)
12809 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
12810 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012811}
12812
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table of the str iterator; it exposes only __length_hint__. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
12820
/* Type object for the str iterator.  Slots are filled positionally; the
   comment on each line names the slot that the value occupies. */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",         /* tp_name */
    sizeof(unicodeiterobject),      /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    0,                  /* tp_repr */
    0,                  /* tp_as_number */
    0,                  /* tp_as_sequence */
    0,                  /* tp_as_mapping */
    0,                  /* tp_hash */
    0,                  /* tp_call */
    0,                  /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                  /* tp_clear */
    0,                  /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    PyObject_SelfIter,          /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,            /* tp_methods */
    0,                  /* tp_members */
};
12853
12854static PyObject *
12855unicode_iter(PyObject *seq)
12856{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012857 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012858
Benjamin Peterson14339b62009-01-31 16:36:08 +000012859 if (!PyUnicode_Check(seq)) {
12860 PyErr_BadInternalCall();
12861 return NULL;
12862 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012863 if (PyUnicode_READY(seq) == -1)
12864 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012865 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
12866 if (it == NULL)
12867 return NULL;
12868 it->it_index = 0;
12869 Py_INCREF(seq);
12870 it->it_seq = (PyUnicodeObject *)seq;
12871 _PyObject_GC_TRACK(it);
12872 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012873}
12874
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012875#define UNIOP(x) Py_UNICODE_##x
12876#define UNIOP_t Py_UNICODE
12877#include "uniops.h"
12878#undef UNIOP
12879#undef UNIOP_t
12880#define UNIOP(x) Py_UCS4_##x
12881#define UNIOP_t Py_UCS4
12882#include "uniops.h"
12883#undef UNIOP
12884#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000012885
Victor Stinner71133ff2010-09-01 23:43:53 +000012886Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000012887PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000012888{
12889 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
12890 Py_UNICODE *copy;
12891 Py_ssize_t size;
12892
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012893 if (!PyUnicode_Check(unicode)) {
12894 PyErr_BadArgument();
12895 return NULL;
12896 }
Victor Stinner71133ff2010-09-01 23:43:53 +000012897 /* Ensure we won't overflow the size. */
12898 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
12899 PyErr_NoMemory();
12900 return NULL;
12901 }
12902 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
12903 size *= sizeof(Py_UNICODE);
12904 copy = PyMem_Malloc(size);
12905 if (copy == NULL) {
12906 PyErr_NoMemory();
12907 return NULL;
12908 }
12909 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
12910 return copy;
12911}
Martin v. Löwis5b222132007-06-10 09:51:05 +000012912
Georg Brandl66c221e2010-10-14 07:04:07 +000012913/* A _string module, to export formatter_parser and formatter_field_name_split
12914 to the string.Formatter class implemented in Python. */
12915
/* Functions exported by the _string module; each takes exactly one
   argument (METH_O). */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
12923
/* Module definition for _string. */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                              /* module name */
    PyDoc_STR("string helper module"),      /* module docstring */
    0,                                      /* presumably m_size -- TODO confirm */
    _string_methods,                        /* method table */
    /* remaining optional PyModuleDef slots are left unset */
    NULL,
    NULL,
    NULL,
    NULL
};
12935
12936PyMODINIT_FUNC
12937PyInit__string(void)
12938{
12939 return PyModule_Create(&_string_module);
12940}
12941
12942
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012943#ifdef __cplusplus
12944}
12945#endif